{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 220, "global_step": 4390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002280501710376283, "grad_norm": 44.5, "learning_rate": 0.0, "loss": 2.4621, "mean_token_accuracy": 0.659612387418747, "num_tokens": 107751.0, "step": 1 }, { "epoch": 0.004561003420752566, "grad_norm": 42.25, "learning_rate": 2.272727272727273e-08, "loss": 2.5474, "mean_token_accuracy": 0.6517423838376999, "num_tokens": 214469.0, "step": 2 }, { "epoch": 0.0068415051311288486, "grad_norm": 48.75, "learning_rate": 4.545454545454546e-08, "loss": 2.5963, "mean_token_accuracy": 0.6451424211263657, "num_tokens": 321493.0, "step": 3 }, { "epoch": 0.009122006841505131, "grad_norm": 52.75, "learning_rate": 6.818181818181819e-08, "loss": 2.5346, "mean_token_accuracy": 0.656927764415741, "num_tokens": 428374.0, "step": 4 }, { "epoch": 0.011402508551881414, "grad_norm": 33.75, "learning_rate": 9.090909090909091e-08, "loss": 2.5273, "mean_token_accuracy": 0.6588353216648102, "num_tokens": 535756.0, "step": 5 }, { "epoch": 0.013683010262257697, "grad_norm": 32.0, "learning_rate": 1.1363636363636364e-07, "loss": 2.4468, "mean_token_accuracy": 0.6671261787414551, "num_tokens": 642560.0, "step": 6 }, { "epoch": 0.01596351197263398, "grad_norm": 33.5, "learning_rate": 1.3636363636363637e-07, "loss": 2.5426, "mean_token_accuracy": 0.6598037928342819, "num_tokens": 749804.0, "step": 7 }, { "epoch": 0.018244013683010263, "grad_norm": 33.25, "learning_rate": 1.590909090909091e-07, "loss": 2.5504, "mean_token_accuracy": 0.6524623930454254, "num_tokens": 857462.0, "step": 8 }, { "epoch": 0.020524515393386546, "grad_norm": 39.25, "learning_rate": 1.8181818181818183e-07, "loss": 2.4795, "mean_token_accuracy": 0.6614948958158493, "num_tokens": 964652.0, "step": 9 }, { "epoch": 0.02280501710376283, "grad_norm": 32.25, "learning_rate": 2.0454545454545456e-07, "loss": 2.4681, "mean_token_accuracy": 0.6571934372186661, "num_tokens": 1072114.0, "step": 10 }, { "epoch": 0.02508551881413911, "grad_norm": 56.75, "learning_rate": 2.2727272727272729e-07, "loss": 2.5162, "mean_token_accuracy": 0.6555981487035751, "num_tokens": 1179487.0, "step": 11 }, { "epoch": 0.027366020524515394, "grad_norm": 50.0, "learning_rate": 2.5000000000000004e-07, "loss": 2.5639, "mean_token_accuracy": 0.6516416519880295, "num_tokens": 1286278.0, "step": 12 }, { "epoch": 0.029646522234891677, "grad_norm": 36.5, "learning_rate": 2.7272727272727274e-07, "loss": 2.4399, "mean_token_accuracy": 0.669710099697113, "num_tokens": 1393298.0, "step": 13 }, { "epoch": 0.03192702394526796, "grad_norm": 39.0, "learning_rate": 2.954545454545455e-07, "loss": 2.4725, "mean_token_accuracy": 0.6617565155029297, "num_tokens": 1500702.0, "step": 14 }, { "epoch": 0.03420752565564424, "grad_norm": 35.5, "learning_rate": 3.181818181818182e-07, "loss": 2.4827, "mean_token_accuracy": 0.656602531671524, "num_tokens": 1607486.0, "step": 15 }, { "epoch": 0.036488027366020526, "grad_norm": 47.0, "learning_rate": 3.409090909090909e-07, "loss": 2.5839, "mean_token_accuracy": 0.6526448428630829, "num_tokens": 1714872.0, "step": 16 }, { "epoch": 0.03876852907639681, "grad_norm": 35.25, "learning_rate": 3.6363636363636366e-07, "loss": 2.467, "mean_token_accuracy": 0.6649379134178162, "num_tokens": 1822351.0, "step": 17 }, { "epoch": 0.04104903078677309, "grad_norm": 40.75, "learning_rate": 3.8636363636363636e-07, "loss": 2.5003, "mean_token_accuracy": 0.6564287096261978, "num_tokens": 1930114.0, "step": 18 }, { "epoch": 0.043329532497149374, "grad_norm": 33.5, "learning_rate": 4.090909090909091e-07, "loss": 2.5395, "mean_token_accuracy": 0.6506040990352631, "num_tokens": 2038041.0, "step": 19 }, { "epoch": 0.04561003420752566, "grad_norm": 32.75, "learning_rate": 4.3181818181818187e-07, "loss": 2.5359, "mean_token_accuracy": 0.6554941684007645, "num_tokens": 2144846.0, "step": 20 }, { "epoch": 0.04789053591790194, "grad_norm": 31.125, "learning_rate": 4.5454545454545457e-07, "loss": 2.422, "mean_token_accuracy": 0.6640460044145584, "num_tokens": 2252295.0, "step": 21 }, { "epoch": 0.05017103762827822, "grad_norm": 37.5, "learning_rate": 4.772727272727274e-07, "loss": 2.5004, "mean_token_accuracy": 0.659651443362236, "num_tokens": 2359439.0, "step": 22 }, { "epoch": 0.052451539338654506, "grad_norm": 33.5, "learning_rate": 5.000000000000001e-07, "loss": 2.5621, "mean_token_accuracy": 0.6528295874595642, "num_tokens": 2466337.0, "step": 23 }, { "epoch": 0.05473204104903079, "grad_norm": 33.0, "learning_rate": 5.227272727272728e-07, "loss": 2.4917, "mean_token_accuracy": 0.661816731095314, "num_tokens": 2572770.0, "step": 24 }, { "epoch": 0.05701254275940707, "grad_norm": 45.0, "learning_rate": 5.454545454545455e-07, "loss": 2.402, "mean_token_accuracy": 0.663006380200386, "num_tokens": 2680167.0, "step": 25 }, { "epoch": 0.059293044469783354, "grad_norm": 44.5, "learning_rate": 5.681818181818182e-07, "loss": 2.4679, "mean_token_accuracy": 0.6532121747732162, "num_tokens": 2787861.0, "step": 26 }, { "epoch": 0.06157354618015964, "grad_norm": 34.0, "learning_rate": 5.90909090909091e-07, "loss": 2.5954, "mean_token_accuracy": 0.64468814432621, "num_tokens": 2894486.0, "step": 27 }, { "epoch": 0.06385404789053592, "grad_norm": 56.5, "learning_rate": 6.136363636363637e-07, "loss": 2.5431, "mean_token_accuracy": 0.6518365442752838, "num_tokens": 3001693.0, "step": 28 }, { "epoch": 0.0661345496009122, "grad_norm": 37.75, "learning_rate": 6.363636363636364e-07, "loss": 2.5421, "mean_token_accuracy": 0.6533436477184296, "num_tokens": 3108771.0, "step": 29 }, { "epoch": 0.06841505131128849, "grad_norm": 31.5, "learning_rate": 6.590909090909091e-07, "loss": 2.4357, "mean_token_accuracy": 0.6666090935468674, "num_tokens": 3215312.0, "step": 30 }, { "epoch": 0.07069555302166476, "grad_norm": 31.5, "learning_rate": 6.818181818181818e-07, "loss": 2.4502, "mean_token_accuracy": 0.6624381989240646, "num_tokens": 3322496.0, "step": 31 }, { "epoch": 0.07297605473204105, "grad_norm": 32.25, "learning_rate": 7.045454545454545e-07, "loss": 2.5082, "mean_token_accuracy": 0.6581026613712311, "num_tokens": 3429869.0, "step": 32 }, { "epoch": 0.07525655644241733, "grad_norm": 32.5, "learning_rate": 7.272727272727273e-07, "loss": 2.4792, "mean_token_accuracy": 0.6572518944740295, "num_tokens": 3536755.0, "step": 33 }, { "epoch": 0.07753705815279362, "grad_norm": 31.75, "learning_rate": 7.5e-07, "loss": 2.4916, "mean_token_accuracy": 0.6520580649375916, "num_tokens": 3644347.0, "step": 34 }, { "epoch": 0.07981755986316989, "grad_norm": 36.0, "learning_rate": 7.727272727272727e-07, "loss": 2.4185, "mean_token_accuracy": 0.6675815433263779, "num_tokens": 3751037.0, "step": 35 }, { "epoch": 0.08209806157354618, "grad_norm": 32.75, "learning_rate": 7.954545454545455e-07, "loss": 2.4929, "mean_token_accuracy": 0.6536126732826233, "num_tokens": 3857928.0, "step": 36 }, { "epoch": 0.08437856328392246, "grad_norm": 31.125, "learning_rate": 8.181818181818182e-07, "loss": 2.401, "mean_token_accuracy": 0.6677189022302628, "num_tokens": 3965628.0, "step": 37 }, { "epoch": 0.08665906499429875, "grad_norm": 31.875, "learning_rate": 8.409090909090909e-07, "loss": 2.447, "mean_token_accuracy": 0.6582602560520172, "num_tokens": 4072519.0, "step": 38 }, { "epoch": 0.08893956670467502, "grad_norm": 31.375, "learning_rate": 8.636363636363637e-07, "loss": 2.4231, "mean_token_accuracy": 0.6686751246452332, "num_tokens": 4180022.0, "step": 39 }, { "epoch": 0.09122006841505131, "grad_norm": 47.25, "learning_rate": 8.863636363636364e-07, "loss": 2.4306, "mean_token_accuracy": 0.6607790142297745, "num_tokens": 4287045.0, "step": 40 }, { "epoch": 0.09350057012542759, "grad_norm": 33.0, "learning_rate": 9.090909090909091e-07, "loss": 2.326, "mean_token_accuracy": 0.6676892191171646, "num_tokens": 4394995.0, "step": 41 }, { "epoch": 0.09578107183580388, "grad_norm": 32.75, "learning_rate": 9.31818181818182e-07, "loss": 2.4869, "mean_token_accuracy": 0.6459519267082214, "num_tokens": 4501329.0, "step": 42 }, { "epoch": 0.09806157354618016, "grad_norm": 31.875, "learning_rate": 9.545454545454548e-07, "loss": 2.5084, "mean_token_accuracy": 0.6592301279306412, "num_tokens": 4608718.0, "step": 43 }, { "epoch": 0.10034207525655645, "grad_norm": 31.375, "learning_rate": 9.772727272727275e-07, "loss": 2.4805, "mean_token_accuracy": 0.6500378400087357, "num_tokens": 4715658.0, "step": 44 }, { "epoch": 0.10262257696693272, "grad_norm": 30.75, "learning_rate": 1.0000000000000002e-06, "loss": 2.4318, "mean_token_accuracy": 0.6553644686937332, "num_tokens": 4822368.0, "step": 45 }, { "epoch": 0.10490307867730901, "grad_norm": 31.625, "learning_rate": 1.0227272727272729e-06, "loss": 2.4468, "mean_token_accuracy": 0.6600785255432129, "num_tokens": 4929696.0, "step": 46 }, { "epoch": 0.10718358038768529, "grad_norm": 30.25, "learning_rate": 1.0454545454545456e-06, "loss": 2.3474, "mean_token_accuracy": 0.6583467125892639, "num_tokens": 5037621.0, "step": 47 }, { "epoch": 0.10946408209806158, "grad_norm": 30.375, "learning_rate": 1.0681818181818183e-06, "loss": 2.3681, "mean_token_accuracy": 0.6633005887269974, "num_tokens": 5144969.0, "step": 48 }, { "epoch": 0.11174458380843785, "grad_norm": 30.25, "learning_rate": 1.090909090909091e-06, "loss": 2.4083, "mean_token_accuracy": 0.6564080715179443, "num_tokens": 5252296.0, "step": 49 }, { "epoch": 0.11402508551881414, "grad_norm": 31.125, "learning_rate": 1.1136363636363637e-06, "loss": 2.3943, "mean_token_accuracy": 0.6526119261980057, "num_tokens": 5358939.0, "step": 50 }, { "epoch": 0.11630558722919042, "grad_norm": 31.125, "learning_rate": 1.1363636363636364e-06, "loss": 2.4018, "mean_token_accuracy": 0.6534301191568375, "num_tokens": 5465936.0, "step": 51 }, { "epoch": 0.11858608893956671, "grad_norm": 30.125, "learning_rate": 1.159090909090909e-06, "loss": 2.3053, "mean_token_accuracy": 0.6658475399017334, "num_tokens": 5572802.0, "step": 52 }, { "epoch": 0.12086659064994298, "grad_norm": 28.625, "learning_rate": 1.181818181818182e-06, "loss": 2.2423, "mean_token_accuracy": 0.6766009479761124, "num_tokens": 5680067.0, "step": 53 }, { "epoch": 0.12314709236031927, "grad_norm": 29.125, "learning_rate": 1.2045454545454547e-06, "loss": 2.3047, "mean_token_accuracy": 0.6613978892564774, "num_tokens": 5787260.0, "step": 54 }, { "epoch": 0.12542759407069556, "grad_norm": 28.875, "learning_rate": 1.2272727272727274e-06, "loss": 2.2284, "mean_token_accuracy": 0.6748173385858536, "num_tokens": 5894097.0, "step": 55 }, { "epoch": 0.12770809578107184, "grad_norm": 28.5, "learning_rate": 1.25e-06, "loss": 2.3062, "mean_token_accuracy": 0.665241539478302, "num_tokens": 6001210.0, "step": 56 }, { "epoch": 0.12998859749144812, "grad_norm": 26.75, "learning_rate": 1.2727272727272728e-06, "loss": 2.1486, "mean_token_accuracy": 0.6805437654256821, "num_tokens": 6109107.0, "step": 57 }, { "epoch": 0.1322690992018244, "grad_norm": 28.25, "learning_rate": 1.2954545454545455e-06, "loss": 2.2641, "mean_token_accuracy": 0.6705231070518494, "num_tokens": 6216222.0, "step": 58 }, { "epoch": 0.1345496009122007, "grad_norm": 28.75, "learning_rate": 1.3181818181818182e-06, "loss": 2.2772, "mean_token_accuracy": 0.6666675806045532, "num_tokens": 6323452.0, "step": 59 }, { "epoch": 0.13683010262257697, "grad_norm": 29.625, "learning_rate": 1.3409090909090911e-06, "loss": 2.231, "mean_token_accuracy": 0.6695638597011566, "num_tokens": 6430912.0, "step": 60 }, { "epoch": 0.13911060433295325, "grad_norm": 27.375, "learning_rate": 1.3636363636363636e-06, "loss": 2.2394, "mean_token_accuracy": 0.6609823554754257, "num_tokens": 6538019.0, "step": 61 }, { "epoch": 0.14139110604332952, "grad_norm": 27.75, "learning_rate": 1.3863636363636365e-06, "loss": 2.2672, "mean_token_accuracy": 0.6677799224853516, "num_tokens": 6644863.0, "step": 62 }, { "epoch": 0.14367160775370583, "grad_norm": 27.5, "learning_rate": 1.409090909090909e-06, "loss": 2.2019, "mean_token_accuracy": 0.66669762134552, "num_tokens": 6752333.0, "step": 63 }, { "epoch": 0.1459521094640821, "grad_norm": 27.0, "learning_rate": 1.431818181818182e-06, "loss": 2.2077, "mean_token_accuracy": 0.6657529324293137, "num_tokens": 6859405.0, "step": 64 }, { "epoch": 0.14823261117445838, "grad_norm": 37.75, "learning_rate": 1.4545454545454546e-06, "loss": 2.1799, "mean_token_accuracy": 0.6656498908996582, "num_tokens": 6966421.0, "step": 65 }, { "epoch": 0.15051311288483465, "grad_norm": 27.25, "learning_rate": 1.4772727272727275e-06, "loss": 2.2099, "mean_token_accuracy": 0.6709634065628052, "num_tokens": 7073157.0, "step": 66 }, { "epoch": 0.15279361459521096, "grad_norm": 29.5, "learning_rate": 1.5e-06, "loss": 2.1091, "mean_token_accuracy": 0.6776281297206879, "num_tokens": 7180043.0, "step": 67 }, { "epoch": 0.15507411630558723, "grad_norm": 27.25, "learning_rate": 1.522727272727273e-06, "loss": 2.1265, "mean_token_accuracy": 0.6751858294010162, "num_tokens": 7287133.0, "step": 68 }, { "epoch": 0.1573546180159635, "grad_norm": 25.875, "learning_rate": 1.5454545454545454e-06, "loss": 2.1089, "mean_token_accuracy": 0.6743377894163132, "num_tokens": 7394475.0, "step": 69 }, { "epoch": 0.15963511972633979, "grad_norm": 29.375, "learning_rate": 1.5681818181818184e-06, "loss": 2.0552, "mean_token_accuracy": 0.6845020353794098, "num_tokens": 7502908.0, "step": 70 }, { "epoch": 0.1619156214367161, "grad_norm": 26.25, "learning_rate": 1.590909090909091e-06, "loss": 2.1523, "mean_token_accuracy": 0.6694881021976471, "num_tokens": 7609855.0, "step": 71 }, { "epoch": 0.16419612314709237, "grad_norm": 26.125, "learning_rate": 1.613636363636364e-06, "loss": 2.168, "mean_token_accuracy": 0.6710262894630432, "num_tokens": 7716660.0, "step": 72 }, { "epoch": 0.16647662485746864, "grad_norm": 25.875, "learning_rate": 1.6363636363636365e-06, "loss": 2.0526, "mean_token_accuracy": 0.6768545061349869, "num_tokens": 7823567.0, "step": 73 }, { "epoch": 0.16875712656784492, "grad_norm": 25.375, "learning_rate": 1.6590909090909094e-06, "loss": 2.0024, "mean_token_accuracy": 0.6849700808525085, "num_tokens": 7930741.0, "step": 74 }, { "epoch": 0.17103762827822122, "grad_norm": 24.375, "learning_rate": 1.6818181818181819e-06, "loss": 2.0285, "mean_token_accuracy": 0.6854856461286545, "num_tokens": 8037391.0, "step": 75 }, { "epoch": 0.1733181299885975, "grad_norm": 24.125, "learning_rate": 1.7045454545454546e-06, "loss": 1.9599, "mean_token_accuracy": 0.6963415741920471, "num_tokens": 8144926.0, "step": 76 }, { "epoch": 0.17559863169897377, "grad_norm": 25.375, "learning_rate": 1.7272727272727275e-06, "loss": 1.9961, "mean_token_accuracy": 0.6796167641878128, "num_tokens": 8251774.0, "step": 77 }, { "epoch": 0.17787913340935005, "grad_norm": 23.25, "learning_rate": 1.75e-06, "loss": 1.8617, "mean_token_accuracy": 0.7025072127580643, "num_tokens": 8358808.0, "step": 78 }, { "epoch": 0.18015963511972635, "grad_norm": 23.625, "learning_rate": 1.7727272727272729e-06, "loss": 1.8932, "mean_token_accuracy": 0.6979697048664093, "num_tokens": 8466050.0, "step": 79 }, { "epoch": 0.18244013683010263, "grad_norm": 24.25, "learning_rate": 1.7954545454545456e-06, "loss": 1.9363, "mean_token_accuracy": 0.6897751837968826, "num_tokens": 8572892.0, "step": 80 }, { "epoch": 0.1847206385404789, "grad_norm": 23.375, "learning_rate": 1.8181818181818183e-06, "loss": 1.9175, "mean_token_accuracy": 0.6948112696409225, "num_tokens": 8679995.0, "step": 81 }, { "epoch": 0.18700114025085518, "grad_norm": 22.625, "learning_rate": 1.840909090909091e-06, "loss": 1.825, "mean_token_accuracy": 0.7063078880310059, "num_tokens": 8787388.0, "step": 82 }, { "epoch": 0.18928164196123148, "grad_norm": 21.875, "learning_rate": 1.863636363636364e-06, "loss": 1.8128, "mean_token_accuracy": 0.7035099864006042, "num_tokens": 8894141.0, "step": 83 }, { "epoch": 0.19156214367160776, "grad_norm": 22.375, "learning_rate": 1.8863636363636364e-06, "loss": 1.8433, "mean_token_accuracy": 0.6927594691514969, "num_tokens": 9000732.0, "step": 84 }, { "epoch": 0.19384264538198404, "grad_norm": 21.5, "learning_rate": 1.9090909090909095e-06, "loss": 1.7329, "mean_token_accuracy": 0.7171757072210312, "num_tokens": 9107855.0, "step": 85 }, { "epoch": 0.1961231470923603, "grad_norm": 21.5, "learning_rate": 1.931818181818182e-06, "loss": 1.7188, "mean_token_accuracy": 0.7101240158081055, "num_tokens": 9215544.0, "step": 86 }, { "epoch": 0.19840364880273662, "grad_norm": 21.625, "learning_rate": 1.954545454545455e-06, "loss": 1.7445, "mean_token_accuracy": 0.7046354413032532, "num_tokens": 9322163.0, "step": 87 }, { "epoch": 0.2006841505131129, "grad_norm": 31.75, "learning_rate": 1.977272727272727e-06, "loss": 1.6605, "mean_token_accuracy": 0.7182554006576538, "num_tokens": 9428959.0, "step": 88 }, { "epoch": 0.20296465222348917, "grad_norm": 20.625, "learning_rate": 2.0000000000000003e-06, "loss": 1.6955, "mean_token_accuracy": 0.7105437815189362, "num_tokens": 9535594.0, "step": 89 }, { "epoch": 0.20524515393386544, "grad_norm": 20.5, "learning_rate": 2.022727272727273e-06, "loss": 1.6534, "mean_token_accuracy": 0.7167372107505798, "num_tokens": 9642716.0, "step": 90 }, { "epoch": 0.20752565564424175, "grad_norm": 20.125, "learning_rate": 2.0454545454545457e-06, "loss": 1.6508, "mean_token_accuracy": 0.7186834812164307, "num_tokens": 9749032.0, "step": 91 }, { "epoch": 0.20980615735461802, "grad_norm": 19.625, "learning_rate": 2.0681818181818184e-06, "loss": 1.6197, "mean_token_accuracy": 0.7244506776332855, "num_tokens": 9855340.0, "step": 92 }, { "epoch": 0.2120866590649943, "grad_norm": 19.5, "learning_rate": 2.090909090909091e-06, "loss": 1.5969, "mean_token_accuracy": 0.7231418937444687, "num_tokens": 9962265.0, "step": 93 }, { "epoch": 0.21436716077537057, "grad_norm": 18.5, "learning_rate": 2.113636363636364e-06, "loss": 1.5364, "mean_token_accuracy": 0.7311695367097855, "num_tokens": 10069156.0, "step": 94 }, { "epoch": 0.21664766248574688, "grad_norm": 17.625, "learning_rate": 2.1363636363636365e-06, "loss": 1.4815, "mean_token_accuracy": 0.7454914003610611, "num_tokens": 10176688.0, "step": 95 }, { "epoch": 0.21892816419612315, "grad_norm": 17.625, "learning_rate": 2.1590909090909092e-06, "loss": 1.4978, "mean_token_accuracy": 0.7258649468421936, "num_tokens": 10283306.0, "step": 96 }, { "epoch": 0.22120866590649943, "grad_norm": 17.5, "learning_rate": 2.181818181818182e-06, "loss": 1.4677, "mean_token_accuracy": 0.7376868277788162, "num_tokens": 10390084.0, "step": 97 }, { "epoch": 0.2234891676168757, "grad_norm": 17.125, "learning_rate": 2.2045454545454547e-06, "loss": 1.4221, "mean_token_accuracy": 0.7373760044574738, "num_tokens": 10497627.0, "step": 98 }, { "epoch": 0.22576966932725198, "grad_norm": 23.875, "learning_rate": 2.2272727272727274e-06, "loss": 1.3852, "mean_token_accuracy": 0.7427248060703278, "num_tokens": 10604483.0, "step": 99 }, { "epoch": 0.22805017103762829, "grad_norm": 18.0, "learning_rate": 2.25e-06, "loss": 1.3928, "mean_token_accuracy": 0.7454716116189957, "num_tokens": 10711923.0, "step": 100 }, { "epoch": 0.23033067274800456, "grad_norm": 19.375, "learning_rate": 2.2727272727272728e-06, "loss": 1.3648, "mean_token_accuracy": 0.749587893486023, "num_tokens": 10818737.0, "step": 101 }, { "epoch": 0.23261117445838084, "grad_norm": 21.5, "learning_rate": 2.295454545454546e-06, "loss": 1.3399, "mean_token_accuracy": 0.7486944496631622, "num_tokens": 10925328.0, "step": 102 }, { "epoch": 0.2348916761687571, "grad_norm": 14.9375, "learning_rate": 2.318181818181818e-06, "loss": 1.2484, "mean_token_accuracy": 0.7670646905899048, "num_tokens": 11032717.0, "step": 103 }, { "epoch": 0.23717217787913342, "grad_norm": 14.9375, "learning_rate": 2.3409090909090913e-06, "loss": 1.2661, "mean_token_accuracy": 0.7583093047142029, "num_tokens": 11139345.0, "step": 104 }, { "epoch": 0.2394526795895097, "grad_norm": 14.375, "learning_rate": 2.363636363636364e-06, "loss": 1.2064, "mean_token_accuracy": 0.7690076977014542, "num_tokens": 11245962.0, "step": 105 }, { "epoch": 0.24173318129988597, "grad_norm": 13.9375, "learning_rate": 2.3863636363636367e-06, "loss": 1.2029, "mean_token_accuracy": 0.7617898732423782, "num_tokens": 11352972.0, "step": 106 }, { "epoch": 0.24401368301026224, "grad_norm": 13.75, "learning_rate": 2.4090909090909094e-06, "loss": 1.153, "mean_token_accuracy": 0.7770701050758362, "num_tokens": 11460132.0, "step": 107 }, { "epoch": 0.24629418472063855, "grad_norm": 13.8125, "learning_rate": 2.431818181818182e-06, "loss": 1.1218, "mean_token_accuracy": 0.7785277962684631, "num_tokens": 11567105.0, "step": 108 }, { "epoch": 0.24857468643101482, "grad_norm": 12.5625, "learning_rate": 2.454545454545455e-06, "loss": 1.0815, "mean_token_accuracy": 0.7852117866277695, "num_tokens": 11674498.0, "step": 109 }, { "epoch": 0.2508551881413911, "grad_norm": 12.625, "learning_rate": 2.4772727272727275e-06, "loss": 1.0581, "mean_token_accuracy": 0.7985803633928299, "num_tokens": 11780696.0, "step": 110 }, { "epoch": 0.2531356898517674, "grad_norm": 12.0, "learning_rate": 2.5e-06, "loss": 1.0621, "mean_token_accuracy": 0.7939664274454117, "num_tokens": 11887719.0, "step": 111 }, { "epoch": 0.2554161915621437, "grad_norm": 10.875, "learning_rate": 2.522727272727273e-06, "loss": 0.9893, "mean_token_accuracy": 0.812217116355896, "num_tokens": 11994518.0, "step": 112 }, { "epoch": 0.2576966932725199, "grad_norm": 10.4375, "learning_rate": 2.5454545454545456e-06, "loss": 1.009, "mean_token_accuracy": 0.8142502456903458, "num_tokens": 12101625.0, "step": 113 }, { "epoch": 0.25997719498289623, "grad_norm": 9.5625, "learning_rate": 2.5681818181818187e-06, "loss": 1.0013, "mean_token_accuracy": 0.8154999315738678, "num_tokens": 12208599.0, "step": 114 }, { "epoch": 0.26225769669327254, "grad_norm": 9.0625, "learning_rate": 2.590909090909091e-06, "loss": 0.9603, "mean_token_accuracy": 0.8178321123123169, "num_tokens": 12315919.0, "step": 115 }, { "epoch": 0.2645381984036488, "grad_norm": 8.25, "learning_rate": 2.6136363636363637e-06, "loss": 0.9488, "mean_token_accuracy": 0.8180135637521744, "num_tokens": 12422670.0, "step": 116 }, { "epoch": 0.2668187001140251, "grad_norm": 7.625, "learning_rate": 2.6363636363636364e-06, "loss": 0.9013, "mean_token_accuracy": 0.8282536566257477, "num_tokens": 12529416.0, "step": 117 }, { "epoch": 0.2690992018244014, "grad_norm": 13.1875, "learning_rate": 2.6590909090909095e-06, "loss": 0.8886, "mean_token_accuracy": 0.8260611891746521, "num_tokens": 12636798.0, "step": 118 }, { "epoch": 0.27137970353477764, "grad_norm": 6.9375, "learning_rate": 2.6818181818181822e-06, "loss": 0.896, "mean_token_accuracy": 0.8283491432666779, "num_tokens": 12743702.0, "step": 119 }, { "epoch": 0.27366020524515394, "grad_norm": 6.6875, "learning_rate": 2.7045454545454545e-06, "loss": 0.9264, "mean_token_accuracy": 0.8232417404651642, "num_tokens": 12851550.0, "step": 120 }, { "epoch": 0.2759407069555302, "grad_norm": 6.0625, "learning_rate": 2.7272727272727272e-06, "loss": 0.8567, "mean_token_accuracy": 0.8310881853103638, "num_tokens": 12958606.0, "step": 121 }, { "epoch": 0.2782212086659065, "grad_norm": 5.21875, "learning_rate": 2.7500000000000004e-06, "loss": 0.8334, "mean_token_accuracy": 0.8362232446670532, "num_tokens": 13065521.0, "step": 122 }, { "epoch": 0.2805017103762828, "grad_norm": 5.15625, "learning_rate": 2.772727272727273e-06, "loss": 0.8364, "mean_token_accuracy": 0.8403038382530212, "num_tokens": 13172067.0, "step": 123 }, { "epoch": 0.28278221208665905, "grad_norm": 4.75, "learning_rate": 2.7954545454545458e-06, "loss": 0.8005, "mean_token_accuracy": 0.8477163910865784, "num_tokens": 13279427.0, "step": 124 }, { "epoch": 0.28506271379703535, "grad_norm": 4.78125, "learning_rate": 2.818181818181818e-06, "loss": 0.8036, "mean_token_accuracy": 0.8398231416940689, "num_tokens": 13386332.0, "step": 125 }, { "epoch": 0.28734321550741165, "grad_norm": 4.625, "learning_rate": 2.8409090909090916e-06, "loss": 0.7985, "mean_token_accuracy": 0.8417368233203888, "num_tokens": 13493485.0, "step": 126 }, { "epoch": 0.2896237172177879, "grad_norm": 4.40625, "learning_rate": 2.863636363636364e-06, "loss": 0.8215, "mean_token_accuracy": 0.8364972919225693, "num_tokens": 13600526.0, "step": 127 }, { "epoch": 0.2919042189281642, "grad_norm": 4.21875, "learning_rate": 2.8863636363636366e-06, "loss": 0.7888, "mean_token_accuracy": 0.8411660343408585, "num_tokens": 13707335.0, "step": 128 }, { "epoch": 0.29418472063854045, "grad_norm": 3.9375, "learning_rate": 2.9090909090909093e-06, "loss": 0.7955, "mean_token_accuracy": 0.840514287352562, "num_tokens": 13814123.0, "step": 129 }, { "epoch": 0.29646522234891676, "grad_norm": 3.890625, "learning_rate": 2.931818181818182e-06, "loss": 0.7764, "mean_token_accuracy": 0.8397897183895111, "num_tokens": 13921444.0, "step": 130 }, { "epoch": 0.29874572405929306, "grad_norm": 3.84375, "learning_rate": 2.954545454545455e-06, "loss": 0.7721, "mean_token_accuracy": 0.8480268269777298, "num_tokens": 14028739.0, "step": 131 }, { "epoch": 0.3010262257696693, "grad_norm": 4.15625, "learning_rate": 2.9772727272727274e-06, "loss": 0.7483, "mean_token_accuracy": 0.8492945581674576, "num_tokens": 14136015.0, "step": 132 }, { "epoch": 0.3033067274800456, "grad_norm": 4.46875, "learning_rate": 3e-06, "loss": 0.7765, "mean_token_accuracy": 0.8442358523607254, "num_tokens": 14243016.0, "step": 133 }, { "epoch": 0.3055872291904219, "grad_norm": 4.34375, "learning_rate": 3.0227272727272728e-06, "loss": 0.7711, "mean_token_accuracy": 0.8423920571804047, "num_tokens": 14350319.0, "step": 134 }, { "epoch": 0.30786773090079816, "grad_norm": 3.609375, "learning_rate": 3.045454545454546e-06, "loss": 0.767, "mean_token_accuracy": 0.845877543091774, "num_tokens": 14457699.0, "step": 135 }, { "epoch": 0.31014823261117447, "grad_norm": 3.390625, "learning_rate": 3.0681818181818186e-06, "loss": 0.7406, "mean_token_accuracy": 0.8514658808708191, "num_tokens": 14565228.0, "step": 136 }, { "epoch": 0.3124287343215507, "grad_norm": 3.71875, "learning_rate": 3.090909090909091e-06, "loss": 0.7493, "mean_token_accuracy": 0.8524320423603058, "num_tokens": 14671853.0, "step": 137 }, { "epoch": 0.314709236031927, "grad_norm": 3.625, "learning_rate": 3.1136363636363636e-06, "loss": 0.7529, "mean_token_accuracy": 0.8469192087650299, "num_tokens": 14778717.0, "step": 138 }, { "epoch": 0.3169897377423033, "grad_norm": 4.03125, "learning_rate": 3.1363636363636367e-06, "loss": 0.751, "mean_token_accuracy": 0.847698837518692, "num_tokens": 14885625.0, "step": 139 }, { "epoch": 0.31927023945267957, "grad_norm": 3.359375, "learning_rate": 3.1590909090909094e-06, "loss": 0.7293, "mean_token_accuracy": 0.8514856845140457, "num_tokens": 14992824.0, "step": 140 }, { "epoch": 0.3215507411630559, "grad_norm": 4.28125, "learning_rate": 3.181818181818182e-06, "loss": 0.7383, "mean_token_accuracy": 0.8546594232320786, "num_tokens": 15100514.0, "step": 141 }, { "epoch": 0.3238312428734322, "grad_norm": 6.03125, "learning_rate": 3.204545454545455e-06, "loss": 0.7278, "mean_token_accuracy": 0.8488231599330902, "num_tokens": 15208020.0, "step": 142 }, { "epoch": 0.3261117445838084, "grad_norm": 4.875, "learning_rate": 3.227272727272728e-06, "loss": 0.7396, "mean_token_accuracy": 0.8457832187414169, "num_tokens": 15314965.0, "step": 143 }, { "epoch": 0.32839224629418473, "grad_norm": 3.5625, "learning_rate": 3.2500000000000002e-06, "loss": 0.7361, "mean_token_accuracy": 0.8558014631271362, "num_tokens": 15421585.0, "step": 144 }, { "epoch": 0.330672748004561, "grad_norm": 3.109375, "learning_rate": 3.272727272727273e-06, "loss": 0.7336, "mean_token_accuracy": 0.8494018763303757, "num_tokens": 15528412.0, "step": 145 }, { "epoch": 0.3329532497149373, "grad_norm": 3.21875, "learning_rate": 3.2954545454545456e-06, "loss": 0.7171, "mean_token_accuracy": 0.8540607988834381, "num_tokens": 15635689.0, "step": 146 }, { "epoch": 0.3352337514253136, "grad_norm": 3.28125, "learning_rate": 3.3181818181818188e-06, "loss": 0.6972, "mean_token_accuracy": 0.856657013297081, "num_tokens": 15742801.0, "step": 147 }, { "epoch": 0.33751425313568983, "grad_norm": 3.953125, "learning_rate": 3.3409090909090915e-06, "loss": 0.7182, "mean_token_accuracy": 0.8510304987430573, "num_tokens": 15850125.0, "step": 148 }, { "epoch": 0.33979475484606614, "grad_norm": 3.625, "learning_rate": 3.3636363636363637e-06, "loss": 0.7362, "mean_token_accuracy": 0.84682796895504, "num_tokens": 15956807.0, "step": 149 }, { "epoch": 0.34207525655644244, "grad_norm": 3.578125, "learning_rate": 3.3863636363636364e-06, "loss": 0.7022, "mean_token_accuracy": 0.85152368247509, "num_tokens": 16063797.0, "step": 150 }, { "epoch": 0.3443557582668187, "grad_norm": 3.53125, "learning_rate": 3.409090909090909e-06, "loss": 0.6747, "mean_token_accuracy": 0.8609962910413742, "num_tokens": 16170407.0, "step": 151 }, { "epoch": 0.346636259977195, "grad_norm": 3.484375, "learning_rate": 3.4318181818181823e-06, "loss": 0.7054, "mean_token_accuracy": 0.8567307740449905, "num_tokens": 16276735.0, "step": 152 }, { "epoch": 0.34891676168757124, "grad_norm": 3.375, "learning_rate": 3.454545454545455e-06, "loss": 0.6954, "mean_token_accuracy": 0.8543938845396042, "num_tokens": 16383665.0, "step": 153 }, { "epoch": 0.35119726339794755, "grad_norm": 3.984375, "learning_rate": 3.4772727272727277e-06, "loss": 0.7353, "mean_token_accuracy": 0.8494099676609039, "num_tokens": 16490654.0, "step": 154 }, { "epoch": 0.35347776510832385, "grad_norm": 3.53125, "learning_rate": 3.5e-06, "loss": 0.6977, "mean_token_accuracy": 0.8554576933383942, "num_tokens": 16597928.0, "step": 155 }, { "epoch": 0.3557582668187001, "grad_norm": 3.4375, "learning_rate": 3.522727272727273e-06, "loss": 0.666, "mean_token_accuracy": 0.8644505739212036, "num_tokens": 16705070.0, "step": 156 }, { "epoch": 0.3580387685290764, "grad_norm": 4.6875, "learning_rate": 3.5454545454545458e-06, "loss": 0.7023, "mean_token_accuracy": 0.8530746251344681, "num_tokens": 16812219.0, "step": 157 }, { "epoch": 0.3603192702394527, "grad_norm": 3.125, "learning_rate": 3.5681818181818185e-06, "loss": 0.6717, "mean_token_accuracy": 0.8598745912313461, "num_tokens": 16919765.0, "step": 158 }, { "epoch": 0.36259977194982895, "grad_norm": 2.84375, "learning_rate": 3.590909090909091e-06, "loss": 0.7021, "mean_token_accuracy": 0.8543113619089127, "num_tokens": 17026571.0, "step": 159 }, { "epoch": 0.36488027366020526, "grad_norm": 3.265625, "learning_rate": 3.6136363636363643e-06, "loss": 0.7072, "mean_token_accuracy": 0.8556502610445023, "num_tokens": 17133437.0, "step": 160 }, { "epoch": 0.3671607753705815, "grad_norm": 2.578125, "learning_rate": 3.6363636363636366e-06, "loss": 0.686, "mean_token_accuracy": 0.859677642583847, "num_tokens": 17240745.0, "step": 161 }, { "epoch": 0.3694412770809578, "grad_norm": 2.453125, "learning_rate": 3.6590909090909093e-06, "loss": 0.6827, "mean_token_accuracy": 0.8606048673391342, "num_tokens": 17347700.0, "step": 162 }, { "epoch": 0.3717217787913341, "grad_norm": 3.015625, "learning_rate": 3.681818181818182e-06, "loss": 0.6758, "mean_token_accuracy": 0.8614284843206406, "num_tokens": 17455182.0, "step": 163 }, { "epoch": 0.37400228050171036, "grad_norm": 2.671875, "learning_rate": 3.704545454545455e-06, "loss": 0.6935, "mean_token_accuracy": 0.8534423410892487, "num_tokens": 17562182.0, "step": 164 }, { "epoch": 0.37628278221208666, "grad_norm": 2.46875, "learning_rate": 3.727272727272728e-06, "loss": 0.7017, "mean_token_accuracy": 0.855252206325531, "num_tokens": 17668986.0, "step": 165 }, { "epoch": 0.37856328392246297, "grad_norm": 2.625, "learning_rate": 3.7500000000000005e-06, "loss": 0.6615, "mean_token_accuracy": 0.8632187992334366, "num_tokens": 17776052.0, "step": 166 }, { "epoch": 0.3808437856328392, "grad_norm": 2.65625, "learning_rate": 3.772727272727273e-06, "loss": 0.6868, "mean_token_accuracy": 0.854138657450676, "num_tokens": 17883630.0, "step": 167 }, { "epoch": 0.3831242873432155, "grad_norm": 2.546875, "learning_rate": 3.7954545454545455e-06, "loss": 0.6734, "mean_token_accuracy": 0.8595366477966309, "num_tokens": 17990666.0, "step": 168 }, { "epoch": 0.38540478905359177, "grad_norm": 2.890625, "learning_rate": 3.818181818181819e-06, "loss": 0.6898, "mean_token_accuracy": 0.8539352118968964, "num_tokens": 18097695.0, "step": 169 }, { "epoch": 0.38768529076396807, "grad_norm": 2.515625, "learning_rate": 3.840909090909091e-06, "loss": 0.691, "mean_token_accuracy": 0.8536562919616699, "num_tokens": 18204503.0, "step": 170 }, { "epoch": 0.3899657924743444, "grad_norm": 2.46875, "learning_rate": 3.863636363636364e-06, "loss": 0.6677, "mean_token_accuracy": 0.8611757457256317, "num_tokens": 18311196.0, "step": 171 }, { "epoch": 0.3922462941847206, "grad_norm": 3.125, "learning_rate": 3.886363636363637e-06, "loss": 0.7125, "mean_token_accuracy": 0.8503219485282898, "num_tokens": 18417786.0, "step": 172 }, { "epoch": 0.3945267958950969, "grad_norm": 2.0625, "learning_rate": 3.90909090909091e-06, "loss": 0.6665, "mean_token_accuracy": 0.8595822900533676, "num_tokens": 18525225.0, "step": 173 }, { "epoch": 0.39680729760547323, "grad_norm": 2.984375, "learning_rate": 3.931818181818182e-06, "loss": 0.6908, "mean_token_accuracy": 0.8568900525569916, "num_tokens": 18632231.0, "step": 174 }, { "epoch": 0.3990877993158495, "grad_norm": 2.3125, "learning_rate": 3.954545454545454e-06, "loss": 0.6955, "mean_token_accuracy": 0.8539032638072968, "num_tokens": 18739610.0, "step": 175 }, { "epoch": 0.4013683010262258, "grad_norm": 2.6875, "learning_rate": 3.9772727272727275e-06, "loss": 0.6726, "mean_token_accuracy": 0.8602974861860275, "num_tokens": 18846794.0, "step": 176 }, { "epoch": 0.40364880273660203, "grad_norm": 2.484375, "learning_rate": 4.000000000000001e-06, "loss": 0.6678, "mean_token_accuracy": 0.8603723049163818, "num_tokens": 18953693.0, "step": 177 }, { "epoch": 0.40592930444697833, "grad_norm": 3.265625, "learning_rate": 4.022727272727273e-06, "loss": 0.6585, "mean_token_accuracy": 0.8616438657045364, "num_tokens": 19060250.0, "step": 178 }, { "epoch": 0.40820980615735464, "grad_norm": 2.71875, "learning_rate": 4.045454545454546e-06, "loss": 0.6679, "mean_token_accuracy": 0.8587404191493988, "num_tokens": 19167003.0, "step": 179 }, { "epoch": 0.4104903078677309, "grad_norm": 2.265625, "learning_rate": 4.068181818181818e-06, "loss": 0.665, "mean_token_accuracy": 0.8621865957975388, "num_tokens": 19274555.0, "step": 180 }, { "epoch": 0.4127708095781072, "grad_norm": 2.921875, "learning_rate": 4.0909090909090915e-06, "loss": 0.6714, "mean_token_accuracy": 0.8582023978233337, "num_tokens": 19381948.0, "step": 181 }, { "epoch": 0.4150513112884835, "grad_norm": 3.171875, "learning_rate": 4.113636363636364e-06, "loss": 0.6432, "mean_token_accuracy": 0.8636541664600372, "num_tokens": 19489531.0, "step": 182 }, { "epoch": 0.41733181299885974, "grad_norm": 2.53125, "learning_rate": 4.136363636363637e-06, "loss": 0.6553, "mean_token_accuracy": 0.8614503294229507, "num_tokens": 19596788.0, "step": 183 }, { "epoch": 0.41961231470923605, "grad_norm": 2.375, "learning_rate": 4.159090909090909e-06, "loss": 0.6557, "mean_token_accuracy": 0.865259125828743, "num_tokens": 19703381.0, "step": 184 }, { "epoch": 0.4218928164196123, "grad_norm": 2.515625, "learning_rate": 4.181818181818182e-06, "loss": 0.675, "mean_token_accuracy": 0.8583291471004486, "num_tokens": 19810512.0, "step": 185 }, { "epoch": 0.4241733181299886, "grad_norm": 2.21875, "learning_rate": 4.204545454545455e-06, "loss": 0.6707, "mean_token_accuracy": 0.8560606837272644, "num_tokens": 19917996.0, "step": 186 }, { "epoch": 0.4264538198403649, "grad_norm": 3.015625, "learning_rate": 4.227272727272728e-06, "loss": 0.6663, "mean_token_accuracy": 0.8619653731584549, "num_tokens": 20025261.0, "step": 187 }, { "epoch": 0.42873432155074115, "grad_norm": 2.546875, "learning_rate": 4.25e-06, "loss": 0.6772, "mean_token_accuracy": 0.8561718165874481, "num_tokens": 20131720.0, "step": 188 }, { "epoch": 0.43101482326111745, "grad_norm": 2.921875, "learning_rate": 4.272727272727273e-06, "loss": 0.6727, "mean_token_accuracy": 0.8626906722784042, "num_tokens": 20238797.0, "step": 189 }, { "epoch": 0.43329532497149376, "grad_norm": 2.421875, "learning_rate": 4.295454545454546e-06, "loss": 0.6861, "mean_token_accuracy": 0.8580966591835022, "num_tokens": 20345319.0, "step": 190 }, { "epoch": 0.43557582668187, "grad_norm": 2.71875, "learning_rate": 4.3181818181818185e-06, "loss": 0.6749, "mean_token_accuracy": 0.8572024405002594, "num_tokens": 20452421.0, "step": 191 }, { "epoch": 0.4378563283922463, "grad_norm": 4.0, "learning_rate": 4.340909090909091e-06, "loss": 0.6529, "mean_token_accuracy": 0.862261489033699, "num_tokens": 20559771.0, "step": 192 }, { "epoch": 0.44013683010262256, "grad_norm": 2.75, "learning_rate": 4.363636363636364e-06, "loss": 0.6361, "mean_token_accuracy": 0.8651402294635773, "num_tokens": 20667009.0, "step": 193 }, { "epoch": 0.44241733181299886, "grad_norm": 2.84375, "learning_rate": 4.386363636363637e-06, "loss": 0.6849, "mean_token_accuracy": 0.8569408059120178, "num_tokens": 20774389.0, "step": 194 }, { "epoch": 0.44469783352337516, "grad_norm": 4.34375, "learning_rate": 4.409090909090909e-06, "loss": 0.667, "mean_token_accuracy": 0.8601485341787338, "num_tokens": 20881703.0, "step": 195 }, { "epoch": 0.4469783352337514, "grad_norm": 3.875, "learning_rate": 4.4318181818181824e-06, "loss": 0.656, "mean_token_accuracy": 0.8644906729459763, "num_tokens": 20988710.0, "step": 196 }, { "epoch": 0.4492588369441277, "grad_norm": 2.859375, "learning_rate": 4.454545454545455e-06, "loss": 0.6577, "mean_token_accuracy": 0.8626122176647186, "num_tokens": 21095340.0, "step": 197 }, { "epoch": 0.45153933865450396, "grad_norm": 3.40625, "learning_rate": 4.477272727272728e-06, "loss": 0.6547, "mean_token_accuracy": 0.8651628345251083, "num_tokens": 21202493.0, "step": 198 }, { "epoch": 0.45381984036488027, "grad_norm": 3.4375, "learning_rate": 4.5e-06, "loss": 0.6607, "mean_token_accuracy": 0.8622200191020966, "num_tokens": 21309591.0, "step": 199 }, { "epoch": 0.45610034207525657, "grad_norm": 2.46875, "learning_rate": 4.522727272727273e-06, "loss": 0.6405, "mean_token_accuracy": 0.8680934458971024, "num_tokens": 21417007.0, "step": 200 }, { "epoch": 0.4583808437856328, "grad_norm": 3.53125, "learning_rate": 4.5454545454545455e-06, "loss": 0.6589, "mean_token_accuracy": 0.8650902211666107, "num_tokens": 21524653.0, "step": 201 }, { "epoch": 0.4606613454960091, "grad_norm": 4.0625, "learning_rate": 4.568181818181819e-06, "loss": 0.6672, "mean_token_accuracy": 0.8640954792499542, "num_tokens": 21631985.0, "step": 202 }, { "epoch": 0.4629418472063854, "grad_norm": 2.4375, "learning_rate": 4.590909090909092e-06, "loss": 0.6749, "mean_token_accuracy": 0.8592006415128708, "num_tokens": 21738904.0, "step": 203 }, { "epoch": 0.4652223489167617, "grad_norm": 2.65625, "learning_rate": 4.613636363636364e-06, "loss": 0.6362, "mean_token_accuracy": 0.8670714944601059, "num_tokens": 21846587.0, "step": 204 }, { "epoch": 0.467502850627138, "grad_norm": 2.890625, "learning_rate": 4.636363636363636e-06, "loss": 0.6724, "mean_token_accuracy": 0.861112967133522, "num_tokens": 21953786.0, "step": 205 }, { "epoch": 0.4697833523375142, "grad_norm": 4.125, "learning_rate": 4.6590909090909095e-06, "loss": 0.6757, "mean_token_accuracy": 0.860434964299202, "num_tokens": 22061456.0, "step": 206 }, { "epoch": 0.47206385404789053, "grad_norm": 2.34375, "learning_rate": 4.681818181818183e-06, "loss": 0.6586, "mean_token_accuracy": 0.8619724959135056, "num_tokens": 22169315.0, "step": 207 }, { "epoch": 0.47434435575826683, "grad_norm": 2.6875, "learning_rate": 4.704545454545455e-06, "loss": 0.6591, "mean_token_accuracy": 0.8640032708644867, "num_tokens": 22276762.0, "step": 208 }, { "epoch": 0.4766248574686431, "grad_norm": 2.703125, "learning_rate": 4.727272727272728e-06, "loss": 0.6554, "mean_token_accuracy": 0.8638423681259155, "num_tokens": 22383321.0, "step": 209 }, { "epoch": 0.4789053591790194, "grad_norm": 2.03125, "learning_rate": 4.75e-06, "loss": 0.6595, "mean_token_accuracy": 0.8663541227579117, "num_tokens": 22490163.0, "step": 210 }, { "epoch": 0.4811858608893957, "grad_norm": 1.8984375, "learning_rate": 4.772727272727273e-06, "loss": 0.6458, "mean_token_accuracy": 0.8662288784980774, "num_tokens": 22596745.0, "step": 211 }, { "epoch": 0.48346636259977194, "grad_norm": 3.296875, "learning_rate": 4.795454545454546e-06, "loss": 0.6634, "mean_token_accuracy": 0.8609149158000946, "num_tokens": 22704241.0, "step": 212 }, { "epoch": 0.48574686431014824, "grad_norm": 2.296875, "learning_rate": 4.818181818181819e-06, "loss": 0.6559, "mean_token_accuracy": 0.8637297302484512, "num_tokens": 22810936.0, "step": 213 }, { "epoch": 0.4880273660205245, "grad_norm": 2.21875, "learning_rate": 4.840909090909091e-06, "loss": 0.6616, "mean_token_accuracy": 0.8652000278234482, "num_tokens": 22917635.0, "step": 214 }, { "epoch": 0.4903078677309008, "grad_norm": 3.921875, "learning_rate": 4.863636363636364e-06, "loss": 0.6662, "mean_token_accuracy": 0.8612215965986252, "num_tokens": 23024940.0, "step": 215 }, { "epoch": 0.4925883694412771, "grad_norm": 3.796875, "learning_rate": 4.8863636363636365e-06, "loss": 0.6785, "mean_token_accuracy": 0.8561449646949768, "num_tokens": 23132260.0, "step": 216 }, { "epoch": 0.49486887115165334, "grad_norm": 2.734375, "learning_rate": 4.90909090909091e-06, "loss": 0.6614, "mean_token_accuracy": 0.8678651452064514, "num_tokens": 23239311.0, "step": 217 }, { "epoch": 0.49714937286202965, "grad_norm": 2.6875, "learning_rate": 4.931818181818182e-06, "loss": 0.6485, "mean_token_accuracy": 0.8589539378881454, "num_tokens": 23346340.0, "step": 218 }, { "epoch": 0.49942987457240595, "grad_norm": 2.15625, "learning_rate": 4.954545454545455e-06, "loss": 0.6416, "mean_token_accuracy": 0.8672493249177933, "num_tokens": 23454269.0, "step": 219 }, { "epoch": 0.5017103762827823, "grad_norm": 2.53125, "learning_rate": 4.977272727272728e-06, "loss": 0.6669, "mean_token_accuracy": 0.8612665235996246, "num_tokens": 23560711.0, "step": 220 }, { "epoch": 0.5017103762827823, "eval_loss": 0.6530081033706665, "eval_mean_token_accuracy": 0.8651766348701013, "eval_num_tokens": 23560711.0, "eval_runtime": 58.5845, "eval_samples_per_second": 143.126, "eval_steps_per_second": 4.489, "step": 220 }, { "epoch": 0.5039908779931584, "grad_norm": 2.296875, "learning_rate": 5e-06, "loss": 0.62, "mean_token_accuracy": 0.8713521808385849, "num_tokens": 23668025.0, "step": 221 }, { "epoch": 0.5062713797035348, "grad_norm": 1.890625, "learning_rate": 4.999999290524132e-06, "loss": 0.6252, "mean_token_accuracy": 0.8714602589607239, "num_tokens": 23775006.0, "step": 222 }, { "epoch": 0.508551881413911, "grad_norm": 2.703125, "learning_rate": 4.999997162096932e-06, "loss": 0.6736, "mean_token_accuracy": 0.8643509894609451, "num_tokens": 23882355.0, "step": 223 }, { "epoch": 0.5108323831242874, "grad_norm": 2.9375, "learning_rate": 4.999993614719606e-06, "loss": 0.6412, "mean_token_accuracy": 0.8682572692632675, "num_tokens": 23989600.0, "step": 224 }, { "epoch": 0.5131128848346637, "grad_norm": 2.140625, "learning_rate": 4.999988648394169e-06, "loss": 0.6096, "mean_token_accuracy": 0.8725505918264389, "num_tokens": 24097243.0, "step": 225 }, { "epoch": 0.5153933865450399, "grad_norm": 2.890625, "learning_rate": 4.99998226312344e-06, "loss": 0.6531, "mean_token_accuracy": 0.8634347319602966, "num_tokens": 24203953.0, "step": 226 }, { "epoch": 0.5176738882554162, "grad_norm": 2.546875, "learning_rate": 4.999974458911041e-06, "loss": 0.6314, "mean_token_accuracy": 0.8699511885643005, "num_tokens": 24310863.0, "step": 227 }, { "epoch": 0.5199543899657925, "grad_norm": 2.484375, "learning_rate": 4.999965235761404e-06, "loss": 0.6486, "mean_token_accuracy": 0.8652396500110626, "num_tokens": 24417755.0, "step": 228 }, { "epoch": 0.5222348916761688, "grad_norm": 2.34375, "learning_rate": 4.999954593679762e-06, "loss": 0.659, "mean_token_accuracy": 0.864411011338234, "num_tokens": 24524785.0, "step": 229 }, { "epoch": 0.5245153933865451, "grad_norm": 4.3125, "learning_rate": 4.999942532672157e-06, "loss": 0.688, "mean_token_accuracy": 0.854931503534317, "num_tokens": 24631291.0, "step": 230 }, { "epoch": 0.5267958950969214, "grad_norm": 2.859375, "learning_rate": 4.999929052745434e-06, "loss": 0.6541, "mean_token_accuracy": 0.8636079728603363, "num_tokens": 24738568.0, "step": 231 }, { "epoch": 0.5290763968072976, "grad_norm": 3.953125, "learning_rate": 4.999914153907243e-06, "loss": 0.6538, "mean_token_accuracy": 0.8641977459192276, "num_tokens": 24845646.0, "step": 232 }, { "epoch": 0.5313568985176739, "grad_norm": 2.1875, "learning_rate": 4.999897836166041e-06, "loss": 0.6473, "mean_token_accuracy": 0.8661595731973648, "num_tokens": 24952336.0, "step": 233 }, { "epoch": 0.5336374002280502, "grad_norm": 2.6875, "learning_rate": 4.999880099531089e-06, "loss": 0.6576, "mean_token_accuracy": 0.8643601685762405, "num_tokens": 25058478.0, "step": 234 }, { "epoch": 0.5359179019384265, "grad_norm": 2.890625, "learning_rate": 4.999860944012455e-06, "loss": 0.6463, "mean_token_accuracy": 0.8663338273763657, "num_tokens": 25165512.0, "step": 235 }, { "epoch": 0.5381984036488028, "grad_norm": 2.390625, "learning_rate": 4.999840369621011e-06, "loss": 0.6453, "mean_token_accuracy": 0.8652057200670242, "num_tokens": 25272671.0, "step": 236 }, { "epoch": 0.540478905359179, "grad_norm": 2.734375, "learning_rate": 4.999818376368435e-06, "loss": 0.625, "mean_token_accuracy": 0.8729881942272186, "num_tokens": 25379431.0, "step": 237 }, { "epoch": 0.5427594070695553, "grad_norm": 2.15625, "learning_rate": 4.999794964267208e-06, "loss": 0.6275, "mean_token_accuracy": 0.8688799887895584, "num_tokens": 25486822.0, "step": 238 }, { "epoch": 0.5450399087799316, "grad_norm": 2.015625, "learning_rate": 4.9997701333306215e-06, "loss": 0.6471, "mean_token_accuracy": 0.8656931668519974, "num_tokens": 25594363.0, "step": 239 }, { "epoch": 0.5473204104903079, "grad_norm": 5.34375, "learning_rate": 4.999743883572766e-06, "loss": 0.6471, "mean_token_accuracy": 0.8659048229455948, "num_tokens": 25701619.0, "step": 240 }, { "epoch": 0.5496009122006842, "grad_norm": 4.125, "learning_rate": 4.999716215008542e-06, "loss": 0.6573, "mean_token_accuracy": 0.866451695561409, "num_tokens": 25808261.0, "step": 241 }, { "epoch": 0.5518814139110604, "grad_norm": 2.359375, "learning_rate": 4.999687127653654e-06, "loss": 0.6614, "mean_token_accuracy": 0.8627986013889313, "num_tokens": 25915402.0, "step": 242 }, { "epoch": 0.5541619156214367, "grad_norm": 2.421875, "learning_rate": 4.99965662152461e-06, "loss": 0.6355, "mean_token_accuracy": 0.8666711002588272, "num_tokens": 26023048.0, "step": 243 }, { "epoch": 0.556442417331813, "grad_norm": 2.046875, "learning_rate": 4.999624696638725e-06, "loss": 0.6311, "mean_token_accuracy": 0.869884267449379, "num_tokens": 26130137.0, "step": 244 }, { "epoch": 0.5587229190421893, "grad_norm": 2.765625, "learning_rate": 4.999591353014119e-06, "loss": 0.6488, "mean_token_accuracy": 0.8630170971155167, "num_tokens": 26237107.0, "step": 245 }, { "epoch": 0.5610034207525656, "grad_norm": 3.09375, "learning_rate": 4.999556590669718e-06, "loss": 0.6274, "mean_token_accuracy": 0.8699584752321243, "num_tokens": 26344577.0, "step": 246 }, { "epoch": 0.5632839224629419, "grad_norm": 2.34375, "learning_rate": 4.999520409625253e-06, "loss": 0.6507, "mean_token_accuracy": 0.8654352575540543, "num_tokens": 26451583.0, "step": 247 }, { "epoch": 0.5655644241733181, "grad_norm": 2.21875, "learning_rate": 4.999482809901257e-06, "loss": 0.6765, "mean_token_accuracy": 0.8594978898763657, "num_tokens": 26558350.0, "step": 248 }, { "epoch": 0.5678449258836944, "grad_norm": 2.265625, "learning_rate": 4.999443791519074e-06, "loss": 0.6401, "mean_token_accuracy": 0.8702575117349625, "num_tokens": 26665344.0, "step": 249 }, { "epoch": 0.5701254275940707, "grad_norm": 3.03125, "learning_rate": 4.999403354500847e-06, "loss": 0.6382, "mean_token_accuracy": 0.8670401573181152, "num_tokens": 26772368.0, "step": 250 }, { "epoch": 0.572405929304447, "grad_norm": 2.1875, "learning_rate": 4.99936149886953e-06, "loss": 0.6368, "mean_token_accuracy": 0.8696161508560181, "num_tokens": 26879565.0, "step": 251 }, { "epoch": 0.5746864310148233, "grad_norm": 2.734375, "learning_rate": 4.999318224648878e-06, "loss": 0.6335, "mean_token_accuracy": 0.8677998781204224, "num_tokens": 26986810.0, "step": 252 }, { "epoch": 0.5769669327251995, "grad_norm": 2.53125, "learning_rate": 4.999273531863453e-06, "loss": 0.6261, "mean_token_accuracy": 0.8713762909173965, "num_tokens": 27094179.0, "step": 253 }, { "epoch": 0.5792474344355758, "grad_norm": 2.65625, "learning_rate": 4.999227420538622e-06, "loss": 0.6654, "mean_token_accuracy": 0.8648567199707031, "num_tokens": 27201502.0, "step": 254 }, { "epoch": 0.5815279361459521, "grad_norm": 2.25, "learning_rate": 4.999179890700555e-06, "loss": 0.663, "mean_token_accuracy": 0.8646685779094696, "num_tokens": 27308101.0, "step": 255 }, { "epoch": 0.5838084378563284, "grad_norm": 4.6875, "learning_rate": 4.999130942376232e-06, "loss": 0.6307, "mean_token_accuracy": 0.8694456964731216, "num_tokens": 27414923.0, "step": 256 }, { "epoch": 0.5860889395667047, "grad_norm": 2.609375, "learning_rate": 4.999080575593433e-06, "loss": 0.6417, "mean_token_accuracy": 0.8683536648750305, "num_tokens": 27521506.0, "step": 257 }, { "epoch": 0.5883694412770809, "grad_norm": 2.546875, "learning_rate": 4.999028790380746e-06, "loss": 0.6632, "mean_token_accuracy": 0.8628116995096207, "num_tokens": 27628384.0, "step": 258 }, { "epoch": 0.5906499429874572, "grad_norm": 2.203125, "learning_rate": 4.9989755867675635e-06, "loss": 0.638, "mean_token_accuracy": 0.8686887472867966, "num_tokens": 27735457.0, "step": 259 }, { "epoch": 0.5929304446978335, "grad_norm": 2.234375, "learning_rate": 4.998920964784082e-06, "loss": 0.6394, "mean_token_accuracy": 0.868531346321106, "num_tokens": 27842440.0, "step": 260 }, { "epoch": 0.5952109464082098, "grad_norm": 2.578125, "learning_rate": 4.998864924461305e-06, "loss": 0.6169, "mean_token_accuracy": 0.8760952204465866, "num_tokens": 27949150.0, "step": 261 }, { "epoch": 0.5974914481185861, "grad_norm": 2.75, "learning_rate": 4.998807465831039e-06, "loss": 0.6455, "mean_token_accuracy": 0.8662375062704086, "num_tokens": 28056218.0, "step": 262 }, { "epoch": 0.5997719498289624, "grad_norm": 3.0, "learning_rate": 4.998748588925897e-06, "loss": 0.6565, "mean_token_accuracy": 0.8636495620012283, "num_tokens": 28163194.0, "step": 263 }, { "epoch": 0.6020524515393386, "grad_norm": 2.25, "learning_rate": 4.998688293779297e-06, "loss": 0.621, "mean_token_accuracy": 0.8721827417612076, "num_tokens": 28270611.0, "step": 264 }, { "epoch": 0.6043329532497149, "grad_norm": 4.4375, "learning_rate": 4.998626580425459e-06, "loss": 0.6558, "mean_token_accuracy": 0.8628446161746979, "num_tokens": 28377681.0, "step": 265 }, { "epoch": 0.6066134549600912, "grad_norm": 2.78125, "learning_rate": 4.998563448899413e-06, "loss": 0.6314, "mean_token_accuracy": 0.8687665909528732, "num_tokens": 28484960.0, "step": 266 }, { "epoch": 0.6088939566704675, "grad_norm": 2.96875, "learning_rate": 4.998498899236989e-06, "loss": 0.6571, "mean_token_accuracy": 0.8647327572107315, "num_tokens": 28592263.0, "step": 267 }, { "epoch": 0.6111744583808438, "grad_norm": 2.4375, "learning_rate": 4.998432931474825e-06, "loss": 0.6395, "mean_token_accuracy": 0.8670562505722046, "num_tokens": 28699389.0, "step": 268 }, { "epoch": 0.61345496009122, "grad_norm": 2.765625, "learning_rate": 4.998365545650365e-06, "loss": 0.6289, "mean_token_accuracy": 0.8697399348020554, "num_tokens": 28807311.0, "step": 269 }, { "epoch": 0.6157354618015963, "grad_norm": 6.0, "learning_rate": 4.998296741801852e-06, "loss": 0.657, "mean_token_accuracy": 0.8628436028957367, "num_tokens": 28913892.0, "step": 270 }, { "epoch": 0.6180159635119726, "grad_norm": 3.40625, "learning_rate": 4.998226519968341e-06, "loss": 0.6215, "mean_token_accuracy": 0.8716912418603897, "num_tokens": 29020775.0, "step": 271 }, { "epoch": 0.6202964652223489, "grad_norm": 4.0, "learning_rate": 4.998154880189688e-06, "loss": 0.6409, "mean_token_accuracy": 0.8669329136610031, "num_tokens": 29127842.0, "step": 272 }, { "epoch": 0.6225769669327252, "grad_norm": 2.484375, "learning_rate": 4.998081822506552e-06, "loss": 0.643, "mean_token_accuracy": 0.8676059246063232, "num_tokens": 29235156.0, "step": 273 }, { "epoch": 0.6248574686431014, "grad_norm": 3.734375, "learning_rate": 4.998007346960402e-06, "loss": 0.6382, "mean_token_accuracy": 0.8684564977884293, "num_tokens": 29342694.0, "step": 274 }, { "epoch": 0.6271379703534777, "grad_norm": 2.375, "learning_rate": 4.997931453593507e-06, "loss": 0.6342, "mean_token_accuracy": 0.8661504536867142, "num_tokens": 29449934.0, "step": 275 }, { "epoch": 0.629418472063854, "grad_norm": 2.71875, "learning_rate": 4.997854142448944e-06, "loss": 0.632, "mean_token_accuracy": 0.8716614693403244, "num_tokens": 29557007.0, "step": 276 }, { "epoch": 0.6316989737742303, "grad_norm": 2.65625, "learning_rate": 4.997775413570593e-06, "loss": 0.6217, "mean_token_accuracy": 0.8713774085044861, "num_tokens": 29664526.0, "step": 277 }, { "epoch": 0.6339794754846066, "grad_norm": 3.28125, "learning_rate": 4.997695267003139e-06, "loss": 0.6231, "mean_token_accuracy": 0.8719380050897598, "num_tokens": 29771396.0, "step": 278 }, { "epoch": 0.636259977194983, "grad_norm": 2.5, "learning_rate": 4.99761370279207e-06, "loss": 0.6335, "mean_token_accuracy": 0.8700685054063797, "num_tokens": 29877736.0, "step": 279 }, { "epoch": 0.6385404789053591, "grad_norm": 3.703125, "learning_rate": 4.997530720983682e-06, "loss": 0.6331, "mean_token_accuracy": 0.8674991726875305, "num_tokens": 29985140.0, "step": 280 }, { "epoch": 0.6408209806157354, "grad_norm": 3.34375, "learning_rate": 4.9974463216250735e-06, "loss": 0.6618, "mean_token_accuracy": 0.8660477548837662, "num_tokens": 30092099.0, "step": 281 }, { "epoch": 0.6431014823261118, "grad_norm": 3.421875, "learning_rate": 4.997360504764148e-06, "loss": 0.6333, "mean_token_accuracy": 0.8705120533704758, "num_tokens": 30198927.0, "step": 282 }, { "epoch": 0.645381984036488, "grad_norm": 2.25, "learning_rate": 4.997273270449614e-06, "loss": 0.6165, "mean_token_accuracy": 0.8727722465991974, "num_tokens": 30306090.0, "step": 283 }, { "epoch": 0.6476624857468644, "grad_norm": 2.875, "learning_rate": 4.997184618730983e-06, "loss": 0.648, "mean_token_accuracy": 0.8658933788537979, "num_tokens": 30413636.0, "step": 284 }, { "epoch": 0.6499429874572406, "grad_norm": 2.5625, "learning_rate": 4.997094549658572e-06, "loss": 0.6261, "mean_token_accuracy": 0.8701870143413544, "num_tokens": 30521647.0, "step": 285 }, { "epoch": 0.6522234891676169, "grad_norm": 2.46875, "learning_rate": 4.997003063283503e-06, "loss": 0.6482, "mean_token_accuracy": 0.8652370274066925, "num_tokens": 30628370.0, "step": 286 }, { "epoch": 0.6545039908779932, "grad_norm": 3.296875, "learning_rate": 4.996910159657703e-06, "loss": 0.6343, "mean_token_accuracy": 0.8690200746059418, "num_tokens": 30735820.0, "step": 287 }, { "epoch": 0.6567844925883695, "grad_norm": 2.34375, "learning_rate": 4.996815838833899e-06, "loss": 0.6356, "mean_token_accuracy": 0.8680445849895477, "num_tokens": 30843566.0, "step": 288 }, { "epoch": 0.6590649942987458, "grad_norm": 2.90625, "learning_rate": 4.99672010086563e-06, "loss": 0.6028, "mean_token_accuracy": 0.8737114071846008, "num_tokens": 30951346.0, "step": 289 }, { "epoch": 0.661345496009122, "grad_norm": 3.125, "learning_rate": 4.996622945807231e-06, "loss": 0.6135, "mean_token_accuracy": 0.8741404414176941, "num_tokens": 31058663.0, "step": 290 }, { "epoch": 0.6636259977194983, "grad_norm": 2.078125, "learning_rate": 4.996524373713848e-06, "loss": 0.6218, "mean_token_accuracy": 0.8712268769741058, "num_tokens": 31166704.0, "step": 291 }, { "epoch": 0.6659064994298746, "grad_norm": 5.09375, "learning_rate": 4.996424384641428e-06, "loss": 0.6119, "mean_token_accuracy": 0.874211773276329, "num_tokens": 31273610.0, "step": 292 }, { "epoch": 0.6681870011402509, "grad_norm": 6.875, "learning_rate": 4.996322978646722e-06, "loss": 0.6088, "mean_token_accuracy": 0.8734241724014282, "num_tokens": 31380662.0, "step": 293 }, { "epoch": 0.6704675028506272, "grad_norm": 2.796875, "learning_rate": 4.996220155787287e-06, "loss": 0.6526, "mean_token_accuracy": 0.8677922487258911, "num_tokens": 31487775.0, "step": 294 }, { "epoch": 0.6727480045610034, "grad_norm": 2.640625, "learning_rate": 4.996115916121483e-06, "loss": 0.6319, "mean_token_accuracy": 0.8704153895378113, "num_tokens": 31595052.0, "step": 295 }, { "epoch": 0.6750285062713797, "grad_norm": 5.28125, "learning_rate": 4.996010259708475e-06, "loss": 0.6412, "mean_token_accuracy": 0.8663373440504074, "num_tokens": 31702051.0, "step": 296 }, { "epoch": 0.677309007981756, "grad_norm": 2.75, "learning_rate": 4.99590318660823e-06, "loss": 0.6066, "mean_token_accuracy": 0.87331423163414, "num_tokens": 31809437.0, "step": 297 }, { "epoch": 0.6795895096921323, "grad_norm": 2.796875, "learning_rate": 4.9957946968815215e-06, "loss": 0.6374, "mean_token_accuracy": 0.8677855134010315, "num_tokens": 31916405.0, "step": 298 }, { "epoch": 0.6818700114025086, "grad_norm": 3.3125, "learning_rate": 4.995684790589927e-06, "loss": 0.6389, "mean_token_accuracy": 0.8677714616060257, "num_tokens": 32023457.0, "step": 299 }, { "epoch": 0.6841505131128849, "grad_norm": 3.1875, "learning_rate": 4.995573467795825e-06, "loss": 0.6213, "mean_token_accuracy": 0.8719353079795837, "num_tokens": 32130881.0, "step": 300 }, { "epoch": 0.6864310148232611, "grad_norm": 4.3125, "learning_rate": 4.995460728562403e-06, "loss": 0.6392, "mean_token_accuracy": 0.8694345206022263, "num_tokens": 32237937.0, "step": 301 }, { "epoch": 0.6887115165336374, "grad_norm": 3.046875, "learning_rate": 4.9953465729536475e-06, "loss": 0.6415, "mean_token_accuracy": 0.8696666061878204, "num_tokens": 32345103.0, "step": 302 }, { "epoch": 0.6909920182440137, "grad_norm": 2.265625, "learning_rate": 4.995231001034352e-06, "loss": 0.6148, "mean_token_accuracy": 0.868248924612999, "num_tokens": 32451683.0, "step": 303 }, { "epoch": 0.69327251995439, "grad_norm": 2.140625, "learning_rate": 4.995114012870112e-06, "loss": 0.6406, "mean_token_accuracy": 0.8688762336969376, "num_tokens": 32558767.0, "step": 304 }, { "epoch": 0.6955530216647663, "grad_norm": 3.6875, "learning_rate": 4.99499560852733e-06, "loss": 0.629, "mean_token_accuracy": 0.8722851425409317, "num_tokens": 32665998.0, "step": 305 }, { "epoch": 0.6978335233751425, "grad_norm": 5.0, "learning_rate": 4.994875788073207e-06, "loss": 0.6373, "mean_token_accuracy": 0.8682472556829453, "num_tokens": 32773798.0, "step": 306 }, { "epoch": 0.7001140250855188, "grad_norm": 3.5, "learning_rate": 4.994754551575752e-06, "loss": 0.65, "mean_token_accuracy": 0.8666471391916275, "num_tokens": 32880403.0, "step": 307 }, { "epoch": 0.7023945267958951, "grad_norm": 2.328125, "learning_rate": 4.994631899103777e-06, "loss": 0.6419, "mean_token_accuracy": 0.867155522108078, "num_tokens": 32987552.0, "step": 308 }, { "epoch": 0.7046750285062714, "grad_norm": 2.375, "learning_rate": 4.9945078307268974e-06, "loss": 0.6452, "mean_token_accuracy": 0.8653433471918106, "num_tokens": 33094230.0, "step": 309 }, { "epoch": 0.7069555302166477, "grad_norm": 4.84375, "learning_rate": 4.994382346515531e-06, "loss": 0.6171, "mean_token_accuracy": 0.8708781599998474, "num_tokens": 33201782.0, "step": 310 }, { "epoch": 0.7092360319270239, "grad_norm": 4.15625, "learning_rate": 4.9942554465409e-06, "loss": 0.6404, "mean_token_accuracy": 0.8667383790016174, "num_tokens": 33308197.0, "step": 311 }, { "epoch": 0.7115165336374002, "grad_norm": 3.296875, "learning_rate": 4.994127130875032e-06, "loss": 0.6098, "mean_token_accuracy": 0.8729591369628906, "num_tokens": 33415348.0, "step": 312 }, { "epoch": 0.7137970353477765, "grad_norm": 2.078125, "learning_rate": 4.993997399590755e-06, "loss": 0.6182, "mean_token_accuracy": 0.8707993477582932, "num_tokens": 33522331.0, "step": 313 }, { "epoch": 0.7160775370581528, "grad_norm": 4.25, "learning_rate": 4.993866252761702e-06, "loss": 0.6517, "mean_token_accuracy": 0.8653054535388947, "num_tokens": 33629329.0, "step": 314 }, { "epoch": 0.7183580387685291, "grad_norm": 4.1875, "learning_rate": 4.993733690462311e-06, "loss": 0.6286, "mean_token_accuracy": 0.8681600391864777, "num_tokens": 33736412.0, "step": 315 }, { "epoch": 0.7206385404789054, "grad_norm": 1.953125, "learning_rate": 4.99359971276782e-06, "loss": 0.6184, "mean_token_accuracy": 0.8719700872898102, "num_tokens": 33843463.0, "step": 316 }, { "epoch": 0.7229190421892816, "grad_norm": 1.8203125, "learning_rate": 4.993464319754273e-06, "loss": 0.6154, "mean_token_accuracy": 0.8726183176040649, "num_tokens": 33950234.0, "step": 317 }, { "epoch": 0.7251995438996579, "grad_norm": 2.9375, "learning_rate": 4.993327511498516e-06, "loss": 0.625, "mean_token_accuracy": 0.8715388774871826, "num_tokens": 34057495.0, "step": 318 }, { "epoch": 0.7274800456100342, "grad_norm": 2.4375, "learning_rate": 4.9931892880782e-06, "loss": 0.6312, "mean_token_accuracy": 0.8697730153799057, "num_tokens": 34164527.0, "step": 319 }, { "epoch": 0.7297605473204105, "grad_norm": 2.75, "learning_rate": 4.993049649571775e-06, "loss": 0.6444, "mean_token_accuracy": 0.8663023114204407, "num_tokens": 34271517.0, "step": 320 }, { "epoch": 0.7320410490307868, "grad_norm": 4.1875, "learning_rate": 4.992908596058501e-06, "loss": 0.6272, "mean_token_accuracy": 0.8702614009380341, "num_tokens": 34378878.0, "step": 321 }, { "epoch": 0.734321550741163, "grad_norm": 2.640625, "learning_rate": 4.992766127618434e-06, "loss": 0.6261, "mean_token_accuracy": 0.8703981339931488, "num_tokens": 34485783.0, "step": 322 }, { "epoch": 0.7366020524515393, "grad_norm": 2.5, "learning_rate": 4.992622244332439e-06, "loss": 0.6452, "mean_token_accuracy": 0.8647294193506241, "num_tokens": 34592416.0, "step": 323 }, { "epoch": 0.7388825541619156, "grad_norm": 1.953125, "learning_rate": 4.992476946282179e-06, "loss": 0.6313, "mean_token_accuracy": 0.8711864650249481, "num_tokens": 34699693.0, "step": 324 }, { "epoch": 0.7411630558722919, "grad_norm": 3.46875, "learning_rate": 4.992330233550124e-06, "loss": 0.6155, "mean_token_accuracy": 0.8703418523073196, "num_tokens": 34806837.0, "step": 325 }, { "epoch": 0.7434435575826682, "grad_norm": 3.140625, "learning_rate": 4.9921821062195445e-06, "loss": 0.6482, "mean_token_accuracy": 0.8661866039037704, "num_tokens": 34913354.0, "step": 326 }, { "epoch": 0.7457240592930444, "grad_norm": 3.015625, "learning_rate": 4.9920325643745145e-06, "loss": 0.6217, "mean_token_accuracy": 0.8691374510526657, "num_tokens": 35020432.0, "step": 327 }, { "epoch": 0.7480045610034207, "grad_norm": 6.3125, "learning_rate": 4.991881608099912e-06, "loss": 0.6274, "mean_token_accuracy": 0.8629052639007568, "num_tokens": 35127877.0, "step": 328 }, { "epoch": 0.750285062713797, "grad_norm": 4.5, "learning_rate": 4.991729237481417e-06, "loss": 0.6361, "mean_token_accuracy": 0.8679073601961136, "num_tokens": 35234847.0, "step": 329 }, { "epoch": 0.7525655644241733, "grad_norm": 2.21875, "learning_rate": 4.991575452605511e-06, "loss": 0.6226, "mean_token_accuracy": 0.8713287711143494, "num_tokens": 35342056.0, "step": 330 }, { "epoch": 0.7548460661345496, "grad_norm": 2.6875, "learning_rate": 4.9914202535594795e-06, "loss": 0.6452, "mean_token_accuracy": 0.8670705258846283, "num_tokens": 35448907.0, "step": 331 }, { "epoch": 0.7571265678449259, "grad_norm": 3.890625, "learning_rate": 4.991263640431411e-06, "loss": 0.6152, "mean_token_accuracy": 0.8704476803541183, "num_tokens": 35556170.0, "step": 332 }, { "epoch": 0.7594070695553021, "grad_norm": 5.5, "learning_rate": 4.9911056133101965e-06, "loss": 0.6195, "mean_token_accuracy": 0.8712238371372223, "num_tokens": 35662971.0, "step": 333 }, { "epoch": 0.7616875712656784, "grad_norm": 3.171875, "learning_rate": 4.990946172285528e-06, "loss": 0.6431, "mean_token_accuracy": 0.8710373938083649, "num_tokens": 35769814.0, "step": 334 }, { "epoch": 0.7639680729760547, "grad_norm": 2.140625, "learning_rate": 4.990785317447901e-06, "loss": 0.6003, "mean_token_accuracy": 0.8746102750301361, "num_tokens": 35877019.0, "step": 335 }, { "epoch": 0.766248574686431, "grad_norm": 2.953125, "learning_rate": 4.990623048888615e-06, "loss": 0.627, "mean_token_accuracy": 0.8735091537237167, "num_tokens": 35983784.0, "step": 336 }, { "epoch": 0.7685290763968073, "grad_norm": 3.9375, "learning_rate": 4.9904593666997704e-06, "loss": 0.6629, "mean_token_accuracy": 0.8641745299100876, "num_tokens": 36090599.0, "step": 337 }, { "epoch": 0.7708095781071835, "grad_norm": 3.875, "learning_rate": 4.990294270974268e-06, "loss": 0.6328, "mean_token_accuracy": 0.8708094358444214, "num_tokens": 36197751.0, "step": 338 }, { "epoch": 0.7730900798175598, "grad_norm": 2.078125, "learning_rate": 4.990127761805816e-06, "loss": 0.6345, "mean_token_accuracy": 0.8706918656826019, "num_tokens": 36304049.0, "step": 339 }, { "epoch": 0.7753705815279361, "grad_norm": 1.9375, "learning_rate": 4.989959839288919e-06, "loss": 0.6264, "mean_token_accuracy": 0.8699767887592316, "num_tokens": 36411843.0, "step": 340 }, { "epoch": 0.7776510832383124, "grad_norm": 2.765625, "learning_rate": 4.989790503518888e-06, "loss": 0.6192, "mean_token_accuracy": 0.8728293031454086, "num_tokens": 36518890.0, "step": 341 }, { "epoch": 0.7799315849486887, "grad_norm": 5.875, "learning_rate": 4.9896197545918345e-06, "loss": 0.6203, "mean_token_accuracy": 0.8697379231452942, "num_tokens": 36626348.0, "step": 342 }, { "epoch": 0.7822120866590649, "grad_norm": 1.8984375, "learning_rate": 4.989447592604673e-06, "loss": 0.6028, "mean_token_accuracy": 0.8770763874053955, "num_tokens": 36733926.0, "step": 343 }, { "epoch": 0.7844925883694412, "grad_norm": 3.515625, "learning_rate": 4.989274017655117e-06, "loss": 0.6, "mean_token_accuracy": 0.8766891658306122, "num_tokens": 36841051.0, "step": 344 }, { "epoch": 0.7867730900798175, "grad_norm": 3.078125, "learning_rate": 4.989099029841687e-06, "loss": 0.6305, "mean_token_accuracy": 0.8703635185956955, "num_tokens": 36947674.0, "step": 345 }, { "epoch": 0.7890535917901939, "grad_norm": 2.109375, "learning_rate": 4.988922629263701e-06, "loss": 0.6234, "mean_token_accuracy": 0.87026646733284, "num_tokens": 37054412.0, "step": 346 }, { "epoch": 0.7913340935005702, "grad_norm": 3.0625, "learning_rate": 4.988744816021283e-06, "loss": 0.64, "mean_token_accuracy": 0.8698112666606903, "num_tokens": 37162022.0, "step": 347 }, { "epoch": 0.7936145952109465, "grad_norm": 2.15625, "learning_rate": 4.988565590215352e-06, "loss": 0.6163, "mean_token_accuracy": 0.8709748089313507, "num_tokens": 37269040.0, "step": 348 }, { "epoch": 0.7958950969213227, "grad_norm": 4.15625, "learning_rate": 4.9883849519476364e-06, "loss": 0.6075, "mean_token_accuracy": 0.8744599372148514, "num_tokens": 37376445.0, "step": 349 }, { "epoch": 0.798175598631699, "grad_norm": 2.984375, "learning_rate": 4.988202901320663e-06, "loss": 0.6218, "mean_token_accuracy": 0.8723867684602737, "num_tokens": 37483781.0, "step": 350 }, { "epoch": 0.8004561003420753, "grad_norm": 5.0625, "learning_rate": 4.988019438437759e-06, "loss": 0.6543, "mean_token_accuracy": 0.8646156787872314, "num_tokens": 37590388.0, "step": 351 }, { "epoch": 0.8027366020524516, "grad_norm": 3.359375, "learning_rate": 4.987834563403055e-06, "loss": 0.6162, "mean_token_accuracy": 0.8731140941381454, "num_tokens": 37698197.0, "step": 352 }, { "epoch": 0.8050171037628279, "grad_norm": 3.765625, "learning_rate": 4.987648276321482e-06, "loss": 0.658, "mean_token_accuracy": 0.8623346835374832, "num_tokens": 37804721.0, "step": 353 }, { "epoch": 0.8072976054732041, "grad_norm": 5.15625, "learning_rate": 4.987460577298774e-06, "loss": 0.6171, "mean_token_accuracy": 0.8700351715087891, "num_tokens": 37912089.0, "step": 354 }, { "epoch": 0.8095781071835804, "grad_norm": 5.53125, "learning_rate": 4.9872714664414635e-06, "loss": 0.6288, "mean_token_accuracy": 0.8672404289245605, "num_tokens": 38018353.0, "step": 355 }, { "epoch": 0.8118586088939567, "grad_norm": 5.0, "learning_rate": 4.987080943856887e-06, "loss": 0.6396, "mean_token_accuracy": 0.8685039430856705, "num_tokens": 38124920.0, "step": 356 }, { "epoch": 0.814139110604333, "grad_norm": 2.546875, "learning_rate": 4.986889009653183e-06, "loss": 0.6103, "mean_token_accuracy": 0.8747462034225464, "num_tokens": 38232424.0, "step": 357 }, { "epoch": 0.8164196123147093, "grad_norm": 4.5625, "learning_rate": 4.986695663939288e-06, "loss": 0.6204, "mean_token_accuracy": 0.8709569126367569, "num_tokens": 38339566.0, "step": 358 }, { "epoch": 0.8187001140250855, "grad_norm": 4.21875, "learning_rate": 4.986500906824942e-06, "loss": 0.6181, "mean_token_accuracy": 0.8750324100255966, "num_tokens": 38446417.0, "step": 359 }, { "epoch": 0.8209806157354618, "grad_norm": 5.5, "learning_rate": 4.986304738420684e-06, "loss": 0.6115, "mean_token_accuracy": 0.8724595308303833, "num_tokens": 38553490.0, "step": 360 }, { "epoch": 0.8232611174458381, "grad_norm": 3.28125, "learning_rate": 4.9861071588378565e-06, "loss": 0.644, "mean_token_accuracy": 0.8690180629491806, "num_tokens": 38660341.0, "step": 361 }, { "epoch": 0.8255416191562144, "grad_norm": 2.203125, "learning_rate": 4.985908168188602e-06, "loss": 0.6163, "mean_token_accuracy": 0.8723935782909393, "num_tokens": 38767171.0, "step": 362 }, { "epoch": 0.8278221208665907, "grad_norm": 2.390625, "learning_rate": 4.985707766585865e-06, "loss": 0.6186, "mean_token_accuracy": 0.8713973164558411, "num_tokens": 38873913.0, "step": 363 }, { "epoch": 0.830102622576967, "grad_norm": 2.75, "learning_rate": 4.985505954143387e-06, "loss": 0.6212, "mean_token_accuracy": 0.8741555064916611, "num_tokens": 38981023.0, "step": 364 }, { "epoch": 0.8323831242873432, "grad_norm": 1.8359375, "learning_rate": 4.985302730975713e-06, "loss": 0.6252, "mean_token_accuracy": 0.8745421171188354, "num_tokens": 39088181.0, "step": 365 }, { "epoch": 0.8346636259977195, "grad_norm": 2.65625, "learning_rate": 4.9850980971981914e-06, "loss": 0.6419, "mean_token_accuracy": 0.870025172829628, "num_tokens": 39195055.0, "step": 366 }, { "epoch": 0.8369441277080958, "grad_norm": 3.375, "learning_rate": 4.984892052926965e-06, "loss": 0.6414, "mean_token_accuracy": 0.8648830950260162, "num_tokens": 39302023.0, "step": 367 }, { "epoch": 0.8392246294184721, "grad_norm": 1.9609375, "learning_rate": 4.984684598278982e-06, "loss": 0.6332, "mean_token_accuracy": 0.8697508275508881, "num_tokens": 39408616.0, "step": 368 }, { "epoch": 0.8415051311288484, "grad_norm": 2.265625, "learning_rate": 4.984475733371991e-06, "loss": 0.649, "mean_token_accuracy": 0.8658408671617508, "num_tokens": 39515948.0, "step": 369 }, { "epoch": 0.8437856328392246, "grad_norm": 3.265625, "learning_rate": 4.984265458324538e-06, "loss": 0.6415, "mean_token_accuracy": 0.8648363202810287, "num_tokens": 39622628.0, "step": 370 }, { "epoch": 0.8460661345496009, "grad_norm": 3.078125, "learning_rate": 4.984053773255971e-06, "loss": 0.6158, "mean_token_accuracy": 0.8734622299671173, "num_tokens": 39730107.0, "step": 371 }, { "epoch": 0.8483466362599772, "grad_norm": 2.609375, "learning_rate": 4.9838406782864394e-06, "loss": 0.6134, "mean_token_accuracy": 0.8723733127117157, "num_tokens": 39837288.0, "step": 372 }, { "epoch": 0.8506271379703535, "grad_norm": 1.9296875, "learning_rate": 4.983626173536891e-06, "loss": 0.6114, "mean_token_accuracy": 0.8771011680364609, "num_tokens": 39944365.0, "step": 373 }, { "epoch": 0.8529076396807298, "grad_norm": 2.140625, "learning_rate": 4.983410259129075e-06, "loss": 0.6515, "mean_token_accuracy": 0.8640440553426743, "num_tokens": 40051377.0, "step": 374 }, { "epoch": 0.855188141391106, "grad_norm": 2.203125, "learning_rate": 4.983192935185539e-06, "loss": 0.6192, "mean_token_accuracy": 0.8728392422199249, "num_tokens": 40158071.0, "step": 375 }, { "epoch": 0.8574686431014823, "grad_norm": 2.03125, "learning_rate": 4.9829742018296335e-06, "loss": 0.6264, "mean_token_accuracy": 0.8718508183956146, "num_tokens": 40265163.0, "step": 376 }, { "epoch": 0.8597491448118586, "grad_norm": 2.046875, "learning_rate": 4.9827540591855064e-06, "loss": 0.6263, "mean_token_accuracy": 0.8694735020399094, "num_tokens": 40372053.0, "step": 377 }, { "epoch": 0.8620296465222349, "grad_norm": 2.8125, "learning_rate": 4.9825325073781075e-06, "loss": 0.6302, "mean_token_accuracy": 0.8715538680553436, "num_tokens": 40479008.0, "step": 378 }, { "epoch": 0.8643101482326112, "grad_norm": 3.15625, "learning_rate": 4.982309546533184e-06, "loss": 0.6379, "mean_token_accuracy": 0.866651251912117, "num_tokens": 40585719.0, "step": 379 }, { "epoch": 0.8665906499429875, "grad_norm": 2.078125, "learning_rate": 4.982085176777285e-06, "loss": 0.6138, "mean_token_accuracy": 0.8743686825037003, "num_tokens": 40692870.0, "step": 380 }, { "epoch": 0.8688711516533637, "grad_norm": 4.75, "learning_rate": 4.981859398237758e-06, "loss": 0.6326, "mean_token_accuracy": 0.8692844212055206, "num_tokens": 40800059.0, "step": 381 }, { "epoch": 0.87115165336374, "grad_norm": 3.171875, "learning_rate": 4.9816322110427505e-06, "loss": 0.6292, "mean_token_accuracy": 0.8697078227996826, "num_tokens": 40907542.0, "step": 382 }, { "epoch": 0.8734321550741163, "grad_norm": 3.734375, "learning_rate": 4.98140361532121e-06, "loss": 0.6362, "mean_token_accuracy": 0.871175691485405, "num_tokens": 41014351.0, "step": 383 }, { "epoch": 0.8757126567844926, "grad_norm": 4.21875, "learning_rate": 4.981173611202883e-06, "loss": 0.6454, "mean_token_accuracy": 0.8670124560594559, "num_tokens": 41121154.0, "step": 384 }, { "epoch": 0.8779931584948689, "grad_norm": 3.234375, "learning_rate": 4.980942198818315e-06, "loss": 0.63, "mean_token_accuracy": 0.8680609911680222, "num_tokens": 41228488.0, "step": 385 }, { "epoch": 0.8802736602052451, "grad_norm": 4.96875, "learning_rate": 4.980709378298851e-06, "loss": 0.6381, "mean_token_accuracy": 0.8670986741781235, "num_tokens": 41335487.0, "step": 386 }, { "epoch": 0.8825541619156214, "grad_norm": 2.328125, "learning_rate": 4.980475149776636e-06, "loss": 0.6267, "mean_token_accuracy": 0.8722383230924606, "num_tokens": 41442292.0, "step": 387 }, { "epoch": 0.8848346636259977, "grad_norm": 2.421875, "learning_rate": 4.980239513384614e-06, "loss": 0.6291, "mean_token_accuracy": 0.8699522018432617, "num_tokens": 41549340.0, "step": 388 }, { "epoch": 0.887115165336374, "grad_norm": 7.5, "learning_rate": 4.980002469256527e-06, "loss": 0.6157, "mean_token_accuracy": 0.8692153990268707, "num_tokens": 41656335.0, "step": 389 }, { "epoch": 0.8893956670467503, "grad_norm": 3.171875, "learning_rate": 4.979764017526916e-06, "loss": 0.6327, "mean_token_accuracy": 0.8701380044221878, "num_tokens": 41763377.0, "step": 390 }, { "epoch": 0.8916761687571265, "grad_norm": 2.09375, "learning_rate": 4.979524158331123e-06, "loss": 0.635, "mean_token_accuracy": 0.8717087209224701, "num_tokens": 41870195.0, "step": 391 }, { "epoch": 0.8939566704675028, "grad_norm": 2.09375, "learning_rate": 4.979282891805287e-06, "loss": 0.6182, "mean_token_accuracy": 0.8729406297206879, "num_tokens": 41977586.0, "step": 392 }, { "epoch": 0.8962371721778791, "grad_norm": 2.078125, "learning_rate": 4.979040218086345e-06, "loss": 0.6338, "mean_token_accuracy": 0.8731931746006012, "num_tokens": 42084265.0, "step": 393 }, { "epoch": 0.8985176738882554, "grad_norm": 2.109375, "learning_rate": 4.978796137312036e-06, "loss": 0.6323, "mean_token_accuracy": 0.8713531494140625, "num_tokens": 42191215.0, "step": 394 }, { "epoch": 0.9007981755986317, "grad_norm": 2.84375, "learning_rate": 4.978550649620894e-06, "loss": 0.6286, "mean_token_accuracy": 0.868993267416954, "num_tokens": 42298587.0, "step": 395 }, { "epoch": 0.9030786773090079, "grad_norm": 2.140625, "learning_rate": 4.978303755152254e-06, "loss": 0.6027, "mean_token_accuracy": 0.8749285191297531, "num_tokens": 42406175.0, "step": 396 }, { "epoch": 0.9053591790193842, "grad_norm": 4.3125, "learning_rate": 4.978055454046247e-06, "loss": 0.6109, "mean_token_accuracy": 0.8721934705972672, "num_tokens": 42513138.0, "step": 397 }, { "epoch": 0.9076396807297605, "grad_norm": 2.953125, "learning_rate": 4.977805746443807e-06, "loss": 0.6093, "mean_token_accuracy": 0.87620410323143, "num_tokens": 42620098.0, "step": 398 }, { "epoch": 0.9099201824401368, "grad_norm": 2.0625, "learning_rate": 4.9775546324866596e-06, "loss": 0.5789, "mean_token_accuracy": 0.8835956156253815, "num_tokens": 42727074.0, "step": 399 }, { "epoch": 0.9122006841505131, "grad_norm": 4.28125, "learning_rate": 4.977302112317334e-06, "loss": 0.6503, "mean_token_accuracy": 0.8638210296630859, "num_tokens": 42834015.0, "step": 400 }, { "epoch": 0.9144811858608894, "grad_norm": 5.78125, "learning_rate": 4.977048186079155e-06, "loss": 0.6314, "mean_token_accuracy": 0.8687157332897186, "num_tokens": 42940807.0, "step": 401 }, { "epoch": 0.9167616875712656, "grad_norm": 3.328125, "learning_rate": 4.976792853916248e-06, "loss": 0.6368, "mean_token_accuracy": 0.8687217086553574, "num_tokens": 43047379.0, "step": 402 }, { "epoch": 0.9190421892816419, "grad_norm": 4.625, "learning_rate": 4.9765361159735335e-06, "loss": 0.6258, "mean_token_accuracy": 0.8750667423009872, "num_tokens": 43154795.0, "step": 403 }, { "epoch": 0.9213226909920182, "grad_norm": 6.25, "learning_rate": 4.97627797239673e-06, "loss": 0.6312, "mean_token_accuracy": 0.8691485226154327, "num_tokens": 43261640.0, "step": 404 }, { "epoch": 0.9236031927023945, "grad_norm": 2.96875, "learning_rate": 4.976018423332357e-06, "loss": 0.6191, "mean_token_accuracy": 0.875212773680687, "num_tokens": 43368543.0, "step": 405 }, { "epoch": 0.9258836944127709, "grad_norm": 2.890625, "learning_rate": 4.975757468927727e-06, "loss": 0.6041, "mean_token_accuracy": 0.8764694184064865, "num_tokens": 43475828.0, "step": 406 }, { "epoch": 0.928164196123147, "grad_norm": 4.1875, "learning_rate": 4.975495109330954e-06, "loss": 0.6317, "mean_token_accuracy": 0.8676637560129166, "num_tokens": 43582569.0, "step": 407 }, { "epoch": 0.9304446978335233, "grad_norm": 8.4375, "learning_rate": 4.97523134469095e-06, "loss": 0.6399, "mean_token_accuracy": 0.8702896982431412, "num_tokens": 43689655.0, "step": 408 }, { "epoch": 0.9327251995438997, "grad_norm": 2.875, "learning_rate": 4.97496617515742e-06, "loss": 0.5977, "mean_token_accuracy": 0.8774754852056503, "num_tokens": 43797009.0, "step": 409 }, { "epoch": 0.935005701254276, "grad_norm": 2.40625, "learning_rate": 4.974699600880869e-06, "loss": 0.6182, "mean_token_accuracy": 0.8728543817996979, "num_tokens": 43904821.0, "step": 410 }, { "epoch": 0.9372862029646523, "grad_norm": 3.734375, "learning_rate": 4.974431622012601e-06, "loss": 0.6565, "mean_token_accuracy": 0.866395503282547, "num_tokens": 44011564.0, "step": 411 }, { "epoch": 0.9395667046750285, "grad_norm": 2.015625, "learning_rate": 4.974162238704716e-06, "loss": 0.594, "mean_token_accuracy": 0.8790825754404068, "num_tokens": 44118685.0, "step": 412 }, { "epoch": 0.9418472063854048, "grad_norm": 4.4375, "learning_rate": 4.973891451110109e-06, "loss": 0.6196, "mean_token_accuracy": 0.8710049241781235, "num_tokens": 44225866.0, "step": 413 }, { "epoch": 0.9441277080957811, "grad_norm": 5.15625, "learning_rate": 4.973619259382475e-06, "loss": 0.6301, "mean_token_accuracy": 0.8691826015710831, "num_tokens": 44333305.0, "step": 414 }, { "epoch": 0.9464082098061574, "grad_norm": 3.46875, "learning_rate": 4.973345663676305e-06, "loss": 0.6057, "mean_token_accuracy": 0.8764727264642715, "num_tokens": 44439926.0, "step": 415 }, { "epoch": 0.9486887115165337, "grad_norm": 3.078125, "learning_rate": 4.973070664146885e-06, "loss": 0.6067, "mean_token_accuracy": 0.871231347322464, "num_tokens": 44547096.0, "step": 416 }, { "epoch": 0.95096921322691, "grad_norm": 4.25, "learning_rate": 4.972794260950301e-06, "loss": 0.6118, "mean_token_accuracy": 0.8720909953117371, "num_tokens": 44654153.0, "step": 417 }, { "epoch": 0.9532497149372862, "grad_norm": 5.15625, "learning_rate": 4.972516454243433e-06, "loss": 0.6346, "mean_token_accuracy": 0.8684164136648178, "num_tokens": 44761805.0, "step": 418 }, { "epoch": 0.9555302166476625, "grad_norm": 6.25, "learning_rate": 4.972237244183961e-06, "loss": 0.6235, "mean_token_accuracy": 0.8743065893650055, "num_tokens": 44869180.0, "step": 419 }, { "epoch": 0.9578107183580388, "grad_norm": 4.3125, "learning_rate": 4.971956630930356e-06, "loss": 0.6178, "mean_token_accuracy": 0.8736863434314728, "num_tokens": 44976595.0, "step": 420 }, { "epoch": 0.9600912200684151, "grad_norm": 2.21875, "learning_rate": 4.971674614641891e-06, "loss": 0.618, "mean_token_accuracy": 0.8725884109735489, "num_tokens": 45083791.0, "step": 421 }, { "epoch": 0.9623717217787914, "grad_norm": 3.609375, "learning_rate": 4.971391195478632e-06, "loss": 0.6255, "mean_token_accuracy": 0.875871405005455, "num_tokens": 45190655.0, "step": 422 }, { "epoch": 0.9646522234891676, "grad_norm": 4.0, "learning_rate": 4.971106373601443e-06, "loss": 0.6342, "mean_token_accuracy": 0.8689168095588684, "num_tokens": 45297243.0, "step": 423 }, { "epoch": 0.9669327251995439, "grad_norm": 3.5625, "learning_rate": 4.9708201491719825e-06, "loss": 0.623, "mean_token_accuracy": 0.8707065731287003, "num_tokens": 45404013.0, "step": 424 }, { "epoch": 0.9692132269099202, "grad_norm": 2.46875, "learning_rate": 4.9705325223527055e-06, "loss": 0.6136, "mean_token_accuracy": 0.8755741715431213, "num_tokens": 45511141.0, "step": 425 }, { "epoch": 0.9714937286202965, "grad_norm": 2.578125, "learning_rate": 4.970243493306865e-06, "loss": 0.6068, "mean_token_accuracy": 0.8742416203022003, "num_tokens": 45618184.0, "step": 426 }, { "epoch": 0.9737742303306728, "grad_norm": 3.78125, "learning_rate": 4.969953062198508e-06, "loss": 0.6242, "mean_token_accuracy": 0.8716663718223572, "num_tokens": 45725266.0, "step": 427 }, { "epoch": 0.976054732041049, "grad_norm": 2.359375, "learning_rate": 4.969661229192477e-06, "loss": 0.6127, "mean_token_accuracy": 0.8720242828130722, "num_tokens": 45832262.0, "step": 428 }, { "epoch": 0.9783352337514253, "grad_norm": 2.1875, "learning_rate": 4.969367994454412e-06, "loss": 0.5931, "mean_token_accuracy": 0.8772017359733582, "num_tokens": 45939904.0, "step": 429 }, { "epoch": 0.9806157354618016, "grad_norm": 1.9609375, "learning_rate": 4.9690733581507445e-06, "loss": 0.6109, "mean_token_accuracy": 0.8746745586395264, "num_tokens": 46046844.0, "step": 430 }, { "epoch": 0.9828962371721779, "grad_norm": 1.84375, "learning_rate": 4.968777320448707e-06, "loss": 0.622, "mean_token_accuracy": 0.8727656751871109, "num_tokens": 46154121.0, "step": 431 }, { "epoch": 0.9851767388825542, "grad_norm": 2.953125, "learning_rate": 4.9684798815163235e-06, "loss": 0.6052, "mean_token_accuracy": 0.8737929463386536, "num_tokens": 46260970.0, "step": 432 }, { "epoch": 0.9874572405929305, "grad_norm": 2.390625, "learning_rate": 4.968181041522416e-06, "loss": 0.6273, "mean_token_accuracy": 0.8705646842718124, "num_tokens": 46368205.0, "step": 433 }, { "epoch": 0.9897377423033067, "grad_norm": 2.171875, "learning_rate": 4.967880800636599e-06, "loss": 0.6285, "mean_token_accuracy": 0.8682654201984406, "num_tokens": 46475197.0, "step": 434 }, { "epoch": 0.992018244013683, "grad_norm": 2.140625, "learning_rate": 4.967579159029284e-06, "loss": 0.6114, "mean_token_accuracy": 0.8732243329286575, "num_tokens": 46582041.0, "step": 435 }, { "epoch": 0.9942987457240593, "grad_norm": 3.28125, "learning_rate": 4.9672761168716766e-06, "loss": 0.6324, "mean_token_accuracy": 0.8705407828092575, "num_tokens": 46688998.0, "step": 436 }, { "epoch": 0.9965792474344356, "grad_norm": 2.953125, "learning_rate": 4.966971674335778e-06, "loss": 0.6035, "mean_token_accuracy": 0.8775283247232437, "num_tokens": 46795830.0, "step": 437 }, { "epoch": 0.9988597491448119, "grad_norm": 2.125, "learning_rate": 4.966665831594383e-06, "loss": 0.6028, "mean_token_accuracy": 0.8748981207609177, "num_tokens": 46903068.0, "step": 438 }, { "epoch": 1.0, "grad_norm": 7.34375, "learning_rate": 4.966358588821084e-06, "loss": 0.6496, "mean_token_accuracy": 0.8653521537780762, "num_tokens": 46942232.0, "step": 439 }, { "epoch": 1.0022805017103762, "grad_norm": 2.0, "learning_rate": 4.966049946190265e-06, "loss": 0.624, "mean_token_accuracy": 0.8737205117940903, "num_tokens": 47049296.0, "step": 440 }, { "epoch": 1.0022805017103762, "eval_loss": 0.6221891045570374, "eval_mean_token_accuracy": 0.8722115945453426, "eval_num_tokens": 47049296.0, "eval_runtime": 58.6597, "eval_samples_per_second": 142.943, "eval_steps_per_second": 4.483, "step": 440 }, { "epoch": 1.0045610034207526, "grad_norm": 2.015625, "learning_rate": 4.9657399038771045e-06, "loss": 0.5872, "mean_token_accuracy": 0.8777357935905457, "num_tokens": 47156210.0, "step": 441 }, { "epoch": 1.0068415051311288, "grad_norm": 2.4375, "learning_rate": 4.965428462057578e-06, "loss": 0.6234, "mean_token_accuracy": 0.8698296397924423, "num_tokens": 47263285.0, "step": 442 }, { "epoch": 1.0091220068415052, "grad_norm": 4.34375, "learning_rate": 4.965115620908453e-06, "loss": 0.5926, "mean_token_accuracy": 0.8760122805833817, "num_tokens": 47370395.0, "step": 443 }, { "epoch": 1.0114025085518814, "grad_norm": 3.65625, "learning_rate": 4.964801380607293e-06, "loss": 0.6211, "mean_token_accuracy": 0.8716463297605515, "num_tokens": 47477476.0, "step": 444 }, { "epoch": 1.0136830102622576, "grad_norm": 3.25, "learning_rate": 4.964485741332453e-06, "loss": 0.6086, "mean_token_accuracy": 0.8730912059545517, "num_tokens": 47585481.0, "step": 445 }, { "epoch": 1.015963511972634, "grad_norm": 2.5625, "learning_rate": 4.964168703263086e-06, "loss": 0.5887, "mean_token_accuracy": 0.8801993578672409, "num_tokens": 47693090.0, "step": 446 }, { "epoch": 1.0182440136830102, "grad_norm": 5.8125, "learning_rate": 4.963850266579136e-06, "loss": 0.6235, "mean_token_accuracy": 0.8687308132648468, "num_tokens": 47800542.0, "step": 447 }, { "epoch": 1.0205245153933866, "grad_norm": 5.0, "learning_rate": 4.963530431461341e-06, "loss": 0.6048, "mean_token_accuracy": 0.8741424828767776, "num_tokens": 47907379.0, "step": 448 }, { "epoch": 1.0228050171037628, "grad_norm": 5.15625, "learning_rate": 4.963209198091232e-06, "loss": 0.6295, "mean_token_accuracy": 0.8697365671396255, "num_tokens": 48014994.0, "step": 449 }, { "epoch": 1.025085518814139, "grad_norm": 2.203125, "learning_rate": 4.962886566651138e-06, "loss": 0.6177, "mean_token_accuracy": 0.873694971203804, "num_tokens": 48122545.0, "step": 450 }, { "epoch": 1.0273660205245154, "grad_norm": 8.5625, "learning_rate": 4.962562537324176e-06, "loss": 0.6066, "mean_token_accuracy": 0.8789347857236862, "num_tokens": 48230071.0, "step": 451 }, { "epoch": 1.0296465222348916, "grad_norm": 4.5625, "learning_rate": 4.96223711029426e-06, "loss": 0.641, "mean_token_accuracy": 0.8675385862588882, "num_tokens": 48336830.0, "step": 452 }, { "epoch": 1.031927023945268, "grad_norm": 5.90625, "learning_rate": 4.961910285746094e-06, "loss": 0.5954, "mean_token_accuracy": 0.8765672594308853, "num_tokens": 48443698.0, "step": 453 }, { "epoch": 1.0342075256556442, "grad_norm": 2.6875, "learning_rate": 4.9615820638651805e-06, "loss": 0.642, "mean_token_accuracy": 0.8699254840612411, "num_tokens": 48550340.0, "step": 454 }, { "epoch": 1.0364880273660204, "grad_norm": 5.71875, "learning_rate": 4.961252444837809e-06, "loss": 0.6359, "mean_token_accuracy": 0.8699039667844772, "num_tokens": 48657031.0, "step": 455 }, { "epoch": 1.0387685290763968, "grad_norm": 4.90625, "learning_rate": 4.960921428851066e-06, "loss": 0.6169, "mean_token_accuracy": 0.8748660534620285, "num_tokens": 48763597.0, "step": 456 }, { "epoch": 1.041049030786773, "grad_norm": 5.0, "learning_rate": 4.960589016092832e-06, "loss": 0.6144, "mean_token_accuracy": 0.8746305704116821, "num_tokens": 48871423.0, "step": 457 }, { "epoch": 1.0433295324971494, "grad_norm": 6.84375, "learning_rate": 4.960255206751774e-06, "loss": 0.627, "mean_token_accuracy": 0.8704584836959839, "num_tokens": 48978350.0, "step": 458 }, { "epoch": 1.0456100342075256, "grad_norm": 3.515625, "learning_rate": 4.959920001017358e-06, "loss": 0.6198, "mean_token_accuracy": 0.8744053989648819, "num_tokens": 49085534.0, "step": 459 }, { "epoch": 1.047890535917902, "grad_norm": 4.59375, "learning_rate": 4.95958339907984e-06, "loss": 0.6185, "mean_token_accuracy": 0.8731739073991776, "num_tokens": 49192856.0, "step": 460 }, { "epoch": 1.0501710376282782, "grad_norm": 9.1875, "learning_rate": 4.959245401130269e-06, "loss": 0.6314, "mean_token_accuracy": 0.8677381575107574, "num_tokens": 49299373.0, "step": 461 }, { "epoch": 1.0524515393386544, "grad_norm": 6.5625, "learning_rate": 4.958906007360487e-06, "loss": 0.5992, "mean_token_accuracy": 0.876073032617569, "num_tokens": 49406428.0, "step": 462 }, { "epoch": 1.0547320410490308, "grad_norm": 3.96875, "learning_rate": 4.958565217963125e-06, "loss": 0.6201, "mean_token_accuracy": 0.8712072819471359, "num_tokens": 49513674.0, "step": 463 }, { "epoch": 1.057012542759407, "grad_norm": 4.65625, "learning_rate": 4.95822303313161e-06, "loss": 0.6191, "mean_token_accuracy": 0.8719818741083145, "num_tokens": 49620690.0, "step": 464 }, { "epoch": 1.0592930444697835, "grad_norm": 2.4375, "learning_rate": 4.957879453060159e-06, "loss": 0.607, "mean_token_accuracy": 0.874313622713089, "num_tokens": 49728319.0, "step": 465 }, { "epoch": 1.0615735461801596, "grad_norm": 2.859375, "learning_rate": 4.957534477943782e-06, "loss": 0.5972, "mean_token_accuracy": 0.8772328495979309, "num_tokens": 49835825.0, "step": 466 }, { "epoch": 1.0638540478905358, "grad_norm": 4.71875, "learning_rate": 4.957188107978279e-06, "loss": 0.6051, "mean_token_accuracy": 0.8736355155706406, "num_tokens": 49943303.0, "step": 467 }, { "epoch": 1.0661345496009123, "grad_norm": 5.3125, "learning_rate": 4.956840343360245e-06, "loss": 0.6138, "mean_token_accuracy": 0.8750191628932953, "num_tokens": 50050828.0, "step": 468 }, { "epoch": 1.0684150513112884, "grad_norm": 2.71875, "learning_rate": 4.956491184287062e-06, "loss": 0.6051, "mean_token_accuracy": 0.8774718195199966, "num_tokens": 50158185.0, "step": 469 }, { "epoch": 1.0706955530216649, "grad_norm": 2.484375, "learning_rate": 4.9561406309569084e-06, "loss": 0.6381, "mean_token_accuracy": 0.8694217354059219, "num_tokens": 50265385.0, "step": 470 }, { "epoch": 1.072976054732041, "grad_norm": 2.78125, "learning_rate": 4.955788683568749e-06, "loss": 0.6035, "mean_token_accuracy": 0.8717086911201477, "num_tokens": 50372238.0, "step": 471 }, { "epoch": 1.0752565564424172, "grad_norm": 3.625, "learning_rate": 4.955435342322345e-06, "loss": 0.5958, "mean_token_accuracy": 0.8751126825809479, "num_tokens": 50479115.0, "step": 472 }, { "epoch": 1.0775370581527937, "grad_norm": 3.6875, "learning_rate": 4.955080607418244e-06, "loss": 0.6061, "mean_token_accuracy": 0.8753761649131775, "num_tokens": 50586257.0, "step": 473 }, { "epoch": 1.0798175598631699, "grad_norm": 2.421875, "learning_rate": 4.954724479057788e-06, "loss": 0.6195, "mean_token_accuracy": 0.8744408041238785, "num_tokens": 50692536.0, "step": 474 }, { "epoch": 1.0820980615735463, "grad_norm": 2.453125, "learning_rate": 4.954366957443107e-06, "loss": 0.6102, "mean_token_accuracy": 0.8744495362043381, "num_tokens": 50799101.0, "step": 475 }, { "epoch": 1.0843785632839225, "grad_norm": 4.96875, "learning_rate": 4.954008042777125e-06, "loss": 0.6226, "mean_token_accuracy": 0.871866300702095, "num_tokens": 50906128.0, "step": 476 }, { "epoch": 1.0866590649942987, "grad_norm": 3.65625, "learning_rate": 4.953647735263555e-06, "loss": 0.5962, "mean_token_accuracy": 0.8779451102018356, "num_tokens": 51013166.0, "step": 477 }, { "epoch": 1.088939566704675, "grad_norm": 3.40625, "learning_rate": 4.953286035106898e-06, "loss": 0.6202, "mean_token_accuracy": 0.8745466768741608, "num_tokens": 51120460.0, "step": 478 }, { "epoch": 1.0912200684150513, "grad_norm": 2.140625, "learning_rate": 4.952922942512452e-06, "loss": 0.6224, "mean_token_accuracy": 0.8701803684234619, "num_tokens": 51227376.0, "step": 479 }, { "epoch": 1.0935005701254277, "grad_norm": 1.8515625, "learning_rate": 4.9525584576862985e-06, "loss": 0.623, "mean_token_accuracy": 0.8713761270046234, "num_tokens": 51334284.0, "step": 480 }, { "epoch": 1.0957810718358039, "grad_norm": 3.71875, "learning_rate": 4.952192580835313e-06, "loss": 0.6286, "mean_token_accuracy": 0.8694994300603867, "num_tokens": 51441531.0, "step": 481 }, { "epoch": 1.09806157354618, "grad_norm": 4.1875, "learning_rate": 4.9518253121671595e-06, "loss": 0.6201, "mean_token_accuracy": 0.8716080784797668, "num_tokens": 51549055.0, "step": 482 }, { "epoch": 1.1003420752565565, "grad_norm": 2.796875, "learning_rate": 4.951456651890294e-06, "loss": 0.5823, "mean_token_accuracy": 0.881346806883812, "num_tokens": 51656798.0, "step": 483 }, { "epoch": 1.1026225769669327, "grad_norm": 2.359375, "learning_rate": 4.951086600213959e-06, "loss": 0.6183, "mean_token_accuracy": 0.8716641664505005, "num_tokens": 51763790.0, "step": 484 }, { "epoch": 1.104903078677309, "grad_norm": 2.5625, "learning_rate": 4.950715157348191e-06, "loss": 0.6196, "mean_token_accuracy": 0.8753807693719864, "num_tokens": 51870653.0, "step": 485 }, { "epoch": 1.1071835803876853, "grad_norm": 2.921875, "learning_rate": 4.950342323503812e-06, "loss": 0.6146, "mean_token_accuracy": 0.870696559548378, "num_tokens": 51977455.0, "step": 486 }, { "epoch": 1.1094640820980617, "grad_norm": 4.625, "learning_rate": 4.949968098892436e-06, "loss": 0.642, "mean_token_accuracy": 0.8685554414987564, "num_tokens": 52083822.0, "step": 487 }, { "epoch": 1.1117445838084379, "grad_norm": 2.484375, "learning_rate": 4.949592483726465e-06, "loss": 0.6149, "mean_token_accuracy": 0.8732410371303558, "num_tokens": 52191177.0, "step": 488 }, { "epoch": 1.114025085518814, "grad_norm": 2.140625, "learning_rate": 4.949215478219092e-06, "loss": 0.5947, "mean_token_accuracy": 0.8758023232221603, "num_tokens": 52297640.0, "step": 489 }, { "epoch": 1.1163055872291905, "grad_norm": 2.0625, "learning_rate": 4.948837082584298e-06, "loss": 0.6011, "mean_token_accuracy": 0.8783663511276245, "num_tokens": 52404875.0, "step": 490 }, { "epoch": 1.1185860889395667, "grad_norm": 4.5625, "learning_rate": 4.9484572970368516e-06, "loss": 0.6263, "mean_token_accuracy": 0.8689621537923813, "num_tokens": 52511463.0, "step": 491 }, { "epoch": 1.120866590649943, "grad_norm": 3.234375, "learning_rate": 4.948076121792313e-06, "loss": 0.6115, "mean_token_accuracy": 0.8712314665317535, "num_tokens": 52617733.0, "step": 492 }, { "epoch": 1.1231470923603193, "grad_norm": 2.5625, "learning_rate": 4.9476935570670294e-06, "loss": 0.6101, "mean_token_accuracy": 0.8728629648685455, "num_tokens": 52725104.0, "step": 493 }, { "epoch": 1.1254275940706955, "grad_norm": 2.796875, "learning_rate": 4.947309603078138e-06, "loss": 0.646, "mean_token_accuracy": 0.8683103322982788, "num_tokens": 52832460.0, "step": 494 }, { "epoch": 1.127708095781072, "grad_norm": 4.84375, "learning_rate": 4.946924260043563e-06, "loss": 0.6341, "mean_token_accuracy": 0.8687001317739487, "num_tokens": 52939314.0, "step": 495 }, { "epoch": 1.129988597491448, "grad_norm": 2.65625, "learning_rate": 4.946537528182017e-06, "loss": 0.6301, "mean_token_accuracy": 0.8715783804655075, "num_tokens": 53045997.0, "step": 496 }, { "epoch": 1.1322690992018245, "grad_norm": 2.5, "learning_rate": 4.946149407713002e-06, "loss": 0.6211, "mean_token_accuracy": 0.8704388439655304, "num_tokens": 53153160.0, "step": 497 }, { "epoch": 1.1345496009122007, "grad_norm": 2.78125, "learning_rate": 4.945759898856809e-06, "loss": 0.6162, "mean_token_accuracy": 0.8732296079397202, "num_tokens": 53260206.0, "step": 498 }, { "epoch": 1.1368301026225769, "grad_norm": 2.140625, "learning_rate": 4.9453690018345144e-06, "loss": 0.6392, "mean_token_accuracy": 0.8706572353839874, "num_tokens": 53366657.0, "step": 499 }, { "epoch": 1.1391106043329533, "grad_norm": 3.25, "learning_rate": 4.944976716867984e-06, "loss": 0.635, "mean_token_accuracy": 0.8699849843978882, "num_tokens": 53473137.0, "step": 500 }, { "epoch": 1.1413911060433295, "grad_norm": 2.09375, "learning_rate": 4.944583044179871e-06, "loss": 0.6214, "mean_token_accuracy": 0.875198557972908, "num_tokens": 53580438.0, "step": 501 }, { "epoch": 1.143671607753706, "grad_norm": 2.421875, "learning_rate": 4.944187983993617e-06, "loss": 0.5929, "mean_token_accuracy": 0.8774997144937515, "num_tokens": 53687875.0, "step": 502 }, { "epoch": 1.145952109464082, "grad_norm": 3.453125, "learning_rate": 4.94379153653345e-06, "loss": 0.6153, "mean_token_accuracy": 0.8719532340764999, "num_tokens": 53794260.0, "step": 503 }, { "epoch": 1.1482326111744583, "grad_norm": 3.890625, "learning_rate": 4.9433937020243854e-06, "loss": 0.6351, "mean_token_accuracy": 0.869365006685257, "num_tokens": 53901213.0, "step": 504 }, { "epoch": 1.1505131128848347, "grad_norm": 2.609375, "learning_rate": 4.942994480692228e-06, "loss": 0.6199, "mean_token_accuracy": 0.8735174834728241, "num_tokens": 54007865.0, "step": 505 }, { "epoch": 1.152793614595211, "grad_norm": 3.3125, "learning_rate": 4.942593872763566e-06, "loss": 0.5824, "mean_token_accuracy": 0.8790851831436157, "num_tokens": 54114696.0, "step": 506 }, { "epoch": 1.1550741163055873, "grad_norm": 2.984375, "learning_rate": 4.9421918784657795e-06, "loss": 0.6112, "mean_token_accuracy": 0.8767211586236954, "num_tokens": 54221690.0, "step": 507 }, { "epoch": 1.1573546180159635, "grad_norm": 3.390625, "learning_rate": 4.94178849802703e-06, "loss": 0.6044, "mean_token_accuracy": 0.8764821738004684, "num_tokens": 54328424.0, "step": 508 }, { "epoch": 1.1596351197263397, "grad_norm": 2.40625, "learning_rate": 4.9413837316762705e-06, "loss": 0.6264, "mean_token_accuracy": 0.8703786879777908, "num_tokens": 54435061.0, "step": 509 }, { "epoch": 1.1619156214367161, "grad_norm": 2.109375, "learning_rate": 4.940977579643237e-06, "loss": 0.6096, "mean_token_accuracy": 0.8734538704156876, "num_tokens": 54541917.0, "step": 510 }, { "epoch": 1.1641961231470923, "grad_norm": 3.15625, "learning_rate": 4.940570042158454e-06, "loss": 0.6149, "mean_token_accuracy": 0.8750556707382202, "num_tokens": 54649030.0, "step": 511 }, { "epoch": 1.1664766248574687, "grad_norm": 4.15625, "learning_rate": 4.940161119453232e-06, "loss": 0.6084, "mean_token_accuracy": 0.8727173060178757, "num_tokens": 54756245.0, "step": 512 }, { "epoch": 1.168757126567845, "grad_norm": 4.53125, "learning_rate": 4.939750811759668e-06, "loss": 0.6083, "mean_token_accuracy": 0.8715848624706268, "num_tokens": 54863315.0, "step": 513 }, { "epoch": 1.171037628278221, "grad_norm": 2.46875, "learning_rate": 4.939339119310645e-06, "loss": 0.621, "mean_token_accuracy": 0.8712294846773148, "num_tokens": 54970714.0, "step": 514 }, { "epoch": 1.1733181299885975, "grad_norm": 3.453125, "learning_rate": 4.93892604233983e-06, "loss": 0.6305, "mean_token_accuracy": 0.8707058429718018, "num_tokens": 55077726.0, "step": 515 }, { "epoch": 1.1755986316989737, "grad_norm": 4.5625, "learning_rate": 4.93851158108168e-06, "loss": 0.588, "mean_token_accuracy": 0.8780573010444641, "num_tokens": 55185327.0, "step": 516 }, { "epoch": 1.1778791334093501, "grad_norm": 3.953125, "learning_rate": 4.938095735771433e-06, "loss": 0.6184, "mean_token_accuracy": 0.8706268519163132, "num_tokens": 55292419.0, "step": 517 }, { "epoch": 1.1801596351197263, "grad_norm": 1.8984375, "learning_rate": 4.937678506645116e-06, "loss": 0.5988, "mean_token_accuracy": 0.873799741268158, "num_tokens": 55399684.0, "step": 518 }, { "epoch": 1.1824401368301025, "grad_norm": 5.84375, "learning_rate": 4.937259893939539e-06, "loss": 0.6202, "mean_token_accuracy": 0.8741802275180817, "num_tokens": 55507140.0, "step": 519 }, { "epoch": 1.184720638540479, "grad_norm": 4.78125, "learning_rate": 4.9368398978923e-06, "loss": 0.6149, "mean_token_accuracy": 0.8741926103830338, "num_tokens": 55614345.0, "step": 520 }, { "epoch": 1.1870011402508551, "grad_norm": 2.984375, "learning_rate": 4.93641851874178e-06, "loss": 0.631, "mean_token_accuracy": 0.8710474520921707, "num_tokens": 55721149.0, "step": 521 }, { "epoch": 1.1892816419612315, "grad_norm": 4.5, "learning_rate": 4.935995756727146e-06, "loss": 0.6192, "mean_token_accuracy": 0.8732537627220154, "num_tokens": 55828356.0, "step": 522 }, { "epoch": 1.1915621436716077, "grad_norm": 2.53125, "learning_rate": 4.935571612088349e-06, "loss": 0.6065, "mean_token_accuracy": 0.875878319144249, "num_tokens": 55935753.0, "step": 523 }, { "epoch": 1.193842645381984, "grad_norm": 2.96875, "learning_rate": 4.935146085066125e-06, "loss": 0.6175, "mean_token_accuracy": 0.874293178319931, "num_tokens": 56042443.0, "step": 524 }, { "epoch": 1.1961231470923603, "grad_norm": 2.59375, "learning_rate": 4.934719175901996e-06, "loss": 0.6016, "mean_token_accuracy": 0.8778766244649887, "num_tokens": 56149378.0, "step": 525 }, { "epoch": 1.1984036488027365, "grad_norm": 3.421875, "learning_rate": 4.934290884838266e-06, "loss": 0.6168, "mean_token_accuracy": 0.8713551312685013, "num_tokens": 56256623.0, "step": 526 }, { "epoch": 1.200684150513113, "grad_norm": 3.703125, "learning_rate": 4.933861212118027e-06, "loss": 0.6319, "mean_token_accuracy": 0.8705524206161499, "num_tokens": 56363653.0, "step": 527 }, { "epoch": 1.2029646522234891, "grad_norm": 3.828125, "learning_rate": 4.933430157985151e-06, "loss": 0.6279, "mean_token_accuracy": 0.8724389970302582, "num_tokens": 56470381.0, "step": 528 }, { "epoch": 1.2052451539338653, "grad_norm": 2.59375, "learning_rate": 4.932997722684296e-06, "loss": 0.6157, "mean_token_accuracy": 0.8716868907213211, "num_tokens": 56577262.0, "step": 529 }, { "epoch": 1.2075256556442417, "grad_norm": 4.625, "learning_rate": 4.932563906460905e-06, "loss": 0.6121, "mean_token_accuracy": 0.8729743212461472, "num_tokens": 56683964.0, "step": 530 }, { "epoch": 1.209806157354618, "grad_norm": 7.375, "learning_rate": 4.932128709561202e-06, "loss": 0.6031, "mean_token_accuracy": 0.8723554909229279, "num_tokens": 56791540.0, "step": 531 }, { "epoch": 1.2120866590649944, "grad_norm": 6.375, "learning_rate": 4.931692132232198e-06, "loss": 0.601, "mean_token_accuracy": 0.8752623051404953, "num_tokens": 56898876.0, "step": 532 }, { "epoch": 1.2143671607753705, "grad_norm": 3.40625, "learning_rate": 4.931254174721687e-06, "loss": 0.6311, "mean_token_accuracy": 0.867269441485405, "num_tokens": 57005808.0, "step": 533 }, { "epoch": 1.216647662485747, "grad_norm": 2.125, "learning_rate": 4.930814837278242e-06, "loss": 0.5913, "mean_token_accuracy": 0.8760450780391693, "num_tokens": 57113633.0, "step": 534 }, { "epoch": 1.2189281641961232, "grad_norm": 5.71875, "learning_rate": 4.930374120151225e-06, "loss": 0.6388, "mean_token_accuracy": 0.8680548220872879, "num_tokens": 57220723.0, "step": 535 }, { "epoch": 1.2212086659064993, "grad_norm": 5.71875, "learning_rate": 4.929932023590776e-06, "loss": 0.6436, "mean_token_accuracy": 0.867982491850853, "num_tokens": 57327185.0, "step": 536 }, { "epoch": 1.2234891676168758, "grad_norm": 7.25, "learning_rate": 4.929488547847823e-06, "loss": 0.6068, "mean_token_accuracy": 0.8724008947610855, "num_tokens": 57433998.0, "step": 537 }, { "epoch": 1.225769669327252, "grad_norm": 5.0, "learning_rate": 4.9290436931740735e-06, "loss": 0.6194, "mean_token_accuracy": 0.8717511296272278, "num_tokens": 57541338.0, "step": 538 }, { "epoch": 1.2280501710376284, "grad_norm": 3.171875, "learning_rate": 4.928597459822018e-06, "loss": 0.6031, "mean_token_accuracy": 0.879209354519844, "num_tokens": 57648210.0, "step": 539 }, { "epoch": 1.2303306727480046, "grad_norm": 3.359375, "learning_rate": 4.928149848044931e-06, "loss": 0.6069, "mean_token_accuracy": 0.8770927637815475, "num_tokens": 57755187.0, "step": 540 }, { "epoch": 1.2326111744583808, "grad_norm": 4.375, "learning_rate": 4.9277008580968665e-06, "loss": 0.5956, "mean_token_accuracy": 0.8778523355722427, "num_tokens": 57862318.0, "step": 541 }, { "epoch": 1.2348916761687572, "grad_norm": 5.03125, "learning_rate": 4.927250490232664e-06, "loss": 0.6139, "mean_token_accuracy": 0.8774797171354294, "num_tokens": 57969230.0, "step": 542 }, { "epoch": 1.2371721778791334, "grad_norm": 3.109375, "learning_rate": 4.926798744707943e-06, "loss": 0.5933, "mean_token_accuracy": 0.8759450018405914, "num_tokens": 58076744.0, "step": 543 }, { "epoch": 1.2394526795895098, "grad_norm": 2.359375, "learning_rate": 4.926345621779106e-06, "loss": 0.6241, "mean_token_accuracy": 0.8749475181102753, "num_tokens": 58184125.0, "step": 544 }, { "epoch": 1.241733181299886, "grad_norm": 2.015625, "learning_rate": 4.9258911217033355e-06, "loss": 0.6161, "mean_token_accuracy": 0.8739427775144577, "num_tokens": 58290980.0, "step": 545 }, { "epoch": 1.2440136830102622, "grad_norm": 3.453125, "learning_rate": 4.925435244738599e-06, "loss": 0.6038, "mean_token_accuracy": 0.8763206750154495, "num_tokens": 58397982.0, "step": 546 }, { "epoch": 1.2462941847206386, "grad_norm": 2.71875, "learning_rate": 4.924977991143642e-06, "loss": 0.5985, "mean_token_accuracy": 0.8783524632453918, "num_tokens": 58505934.0, "step": 547 }, { "epoch": 1.2485746864310148, "grad_norm": 2.5, "learning_rate": 4.924519361177993e-06, "loss": 0.5877, "mean_token_accuracy": 0.8753952831029892, "num_tokens": 58613394.0, "step": 548 }, { "epoch": 1.2508551881413912, "grad_norm": 1.8984375, "learning_rate": 4.9240593551019625e-06, "loss": 0.6099, "mean_token_accuracy": 0.8755134344100952, "num_tokens": 58721077.0, "step": 549 }, { "epoch": 1.2531356898517674, "grad_norm": 2.484375, "learning_rate": 4.92359797317664e-06, "loss": 0.6153, "mean_token_accuracy": 0.8757786601781845, "num_tokens": 58828151.0, "step": 550 }, { "epoch": 1.2554161915621438, "grad_norm": 2.375, "learning_rate": 4.923135215663897e-06, "loss": 0.6188, "mean_token_accuracy": 0.8740972727537155, "num_tokens": 58935083.0, "step": 551 }, { "epoch": 1.25769669327252, "grad_norm": 4.09375, "learning_rate": 4.922671082826386e-06, "loss": 0.6087, "mean_token_accuracy": 0.8730472475290298, "num_tokens": 59041838.0, "step": 552 }, { "epoch": 1.2599771949828962, "grad_norm": 2.3125, "learning_rate": 4.92220557492754e-06, "loss": 0.6085, "mean_token_accuracy": 0.8769043833017349, "num_tokens": 59149008.0, "step": 553 }, { "epoch": 1.2622576966932726, "grad_norm": 2.328125, "learning_rate": 4.921738692231572e-06, "loss": 0.6053, "mean_token_accuracy": 0.8772878795862198, "num_tokens": 59255903.0, "step": 554 }, { "epoch": 1.2645381984036488, "grad_norm": 2.203125, "learning_rate": 4.9212704350034764e-06, "loss": 0.6136, "mean_token_accuracy": 0.8748277425765991, "num_tokens": 59362809.0, "step": 555 }, { "epoch": 1.2668187001140252, "grad_norm": 2.921875, "learning_rate": 4.920800803509026e-06, "loss": 0.5977, "mean_token_accuracy": 0.87485072016716, "num_tokens": 59469454.0, "step": 556 }, { "epoch": 1.2690992018244014, "grad_norm": 4.1875, "learning_rate": 4.920329798014775e-06, "loss": 0.6206, "mean_token_accuracy": 0.8719239979982376, "num_tokens": 59576709.0, "step": 557 }, { "epoch": 1.2713797035347776, "grad_norm": 2.265625, "learning_rate": 4.919857418788056e-06, "loss": 0.6285, "mean_token_accuracy": 0.8737199753522873, "num_tokens": 59683387.0, "step": 558 }, { "epoch": 1.273660205245154, "grad_norm": 2.53125, "learning_rate": 4.919383666096985e-06, "loss": 0.6085, "mean_token_accuracy": 0.8742072433233261, "num_tokens": 59790354.0, "step": 559 }, { "epoch": 1.2759407069555302, "grad_norm": 4.8125, "learning_rate": 4.918908540210452e-06, "loss": 0.6398, "mean_token_accuracy": 0.869797870516777, "num_tokens": 59897756.0, "step": 560 }, { "epoch": 1.2782212086659066, "grad_norm": 2.859375, "learning_rate": 4.91843204139813e-06, "loss": 0.5845, "mean_token_accuracy": 0.88178950548172, "num_tokens": 60004961.0, "step": 561 }, { "epoch": 1.2805017103762828, "grad_norm": 2.40625, "learning_rate": 4.917954169930472e-06, "loss": 0.5929, "mean_token_accuracy": 0.8791621029376984, "num_tokens": 60112141.0, "step": 562 }, { "epoch": 1.282782212086659, "grad_norm": 2.46875, "learning_rate": 4.917474926078707e-06, "loss": 0.6084, "mean_token_accuracy": 0.8750548511743546, "num_tokens": 60220073.0, "step": 563 }, { "epoch": 1.2850627137970354, "grad_norm": 2.75, "learning_rate": 4.916994310114845e-06, "loss": 0.5995, "mean_token_accuracy": 0.8793503940105438, "num_tokens": 60327794.0, "step": 564 }, { "epoch": 1.2873432155074116, "grad_norm": 3.0625, "learning_rate": 4.916512322311675e-06, "loss": 0.6275, "mean_token_accuracy": 0.86960369348526, "num_tokens": 60434871.0, "step": 565 }, { "epoch": 1.289623717217788, "grad_norm": 3.703125, "learning_rate": 4.916028962942763e-06, "loss": 0.5944, "mean_token_accuracy": 0.8765391111373901, "num_tokens": 60542243.0, "step": 566 }, { "epoch": 1.2919042189281642, "grad_norm": 2.890625, "learning_rate": 4.915544232282455e-06, "loss": 0.6144, "mean_token_accuracy": 0.8739331364631653, "num_tokens": 60649576.0, "step": 567 }, { "epoch": 1.2941847206385404, "grad_norm": 1.8828125, "learning_rate": 4.915058130605874e-06, "loss": 0.585, "mean_token_accuracy": 0.8797845989465714, "num_tokens": 60756480.0, "step": 568 }, { "epoch": 1.2964652223489168, "grad_norm": 1.9609375, "learning_rate": 4.9145706581889235e-06, "loss": 0.6264, "mean_token_accuracy": 0.8693940490484238, "num_tokens": 60863068.0, "step": 569 }, { "epoch": 1.298745724059293, "grad_norm": 5.1875, "learning_rate": 4.914081815308283e-06, "loss": 0.5955, "mean_token_accuracy": 0.8792444169521332, "num_tokens": 60969966.0, "step": 570 }, { "epoch": 1.3010262257696694, "grad_norm": 2.75, "learning_rate": 4.913591602241409e-06, "loss": 0.6145, "mean_token_accuracy": 0.8721539825201035, "num_tokens": 61076972.0, "step": 571 }, { "epoch": 1.3033067274800456, "grad_norm": 3.53125, "learning_rate": 4.9131000192665365e-06, "loss": 0.6187, "mean_token_accuracy": 0.8705037534236908, "num_tokens": 61183878.0, "step": 572 }, { "epoch": 1.3055872291904218, "grad_norm": 2.671875, "learning_rate": 4.9126070666626815e-06, "loss": 0.6065, "mean_token_accuracy": 0.8774714469909668, "num_tokens": 61290915.0, "step": 573 }, { "epoch": 1.3078677309007982, "grad_norm": 6.53125, "learning_rate": 4.912112744709632e-06, "loss": 0.599, "mean_token_accuracy": 0.876824364066124, "num_tokens": 61397686.0, "step": 574 }, { "epoch": 1.3101482326111744, "grad_norm": 4.25, "learning_rate": 4.911617053687957e-06, "loss": 0.5937, "mean_token_accuracy": 0.8789774626493454, "num_tokens": 61504802.0, "step": 575 }, { "epoch": 1.3124287343215508, "grad_norm": 2.453125, "learning_rate": 4.911119993878999e-06, "loss": 0.615, "mean_token_accuracy": 0.8741600215435028, "num_tokens": 61611778.0, "step": 576 }, { "epoch": 1.314709236031927, "grad_norm": 2.65625, "learning_rate": 4.910621565564882e-06, "loss": 0.6029, "mean_token_accuracy": 0.8803430199623108, "num_tokens": 61719136.0, "step": 577 }, { "epoch": 1.3169897377423032, "grad_norm": 4.375, "learning_rate": 4.910121769028503e-06, "loss": 0.5989, "mean_token_accuracy": 0.8754008263349533, "num_tokens": 61826166.0, "step": 578 }, { "epoch": 1.3192702394526796, "grad_norm": 8.625, "learning_rate": 4.909620604553537e-06, "loss": 0.6315, "mean_token_accuracy": 0.8695260733366013, "num_tokens": 61933001.0, "step": 579 }, { "epoch": 1.3215507411630558, "grad_norm": 7.34375, "learning_rate": 4.909118072424436e-06, "loss": 0.5984, "mean_token_accuracy": 0.880437433719635, "num_tokens": 62039905.0, "step": 580 }, { "epoch": 1.3238312428734322, "grad_norm": 6.21875, "learning_rate": 4.908614172926426e-06, "loss": 0.5988, "mean_token_accuracy": 0.8764496594667435, "num_tokens": 62147016.0, "step": 581 }, { "epoch": 1.3261117445838084, "grad_norm": 3.203125, "learning_rate": 4.908108906345512e-06, "loss": 0.6047, "mean_token_accuracy": 0.8747891783714294, "num_tokens": 62254504.0, "step": 582 }, { "epoch": 1.3283922462941846, "grad_norm": 3.75, "learning_rate": 4.907602272968473e-06, "loss": 0.6253, "mean_token_accuracy": 0.8759507387876511, "num_tokens": 62361146.0, "step": 583 }, { "epoch": 1.330672748004561, "grad_norm": 2.734375, "learning_rate": 4.907094273082865e-06, "loss": 0.6254, "mean_token_accuracy": 0.8714919984340668, "num_tokens": 62467825.0, "step": 584 }, { "epoch": 1.3329532497149372, "grad_norm": 2.8125, "learning_rate": 4.906584906977018e-06, "loss": 0.606, "mean_token_accuracy": 0.8751325905323029, "num_tokens": 62575437.0, "step": 585 }, { "epoch": 1.3352337514253136, "grad_norm": 2.171875, "learning_rate": 4.906074174940038e-06, "loss": 0.5997, "mean_token_accuracy": 0.8743661940097809, "num_tokens": 62682439.0, "step": 586 }, { "epoch": 1.3375142531356898, "grad_norm": 2.125, "learning_rate": 4.905562077261808e-06, "loss": 0.6095, "mean_token_accuracy": 0.8771595358848572, "num_tokens": 62789139.0, "step": 587 }, { "epoch": 1.339794754846066, "grad_norm": 2.1875, "learning_rate": 4.905048614232984e-06, "loss": 0.62, "mean_token_accuracy": 0.8709544539451599, "num_tokens": 62896172.0, "step": 588 }, { "epoch": 1.3420752565564424, "grad_norm": 1.9765625, "learning_rate": 4.904533786144998e-06, "loss": 0.6211, "mean_token_accuracy": 0.8748406618833542, "num_tokens": 63003536.0, "step": 589 }, { "epoch": 1.3443557582668186, "grad_norm": 2.296875, "learning_rate": 4.904017593290056e-06, "loss": 0.6059, "mean_token_accuracy": 0.8730844259262085, "num_tokens": 63110204.0, "step": 590 }, { "epoch": 1.346636259977195, "grad_norm": 3.03125, "learning_rate": 4.903500035961139e-06, "loss": 0.5986, "mean_token_accuracy": 0.8759681731462479, "num_tokens": 63217782.0, "step": 591 }, { "epoch": 1.3489167616875712, "grad_norm": 2.046875, "learning_rate": 4.902981114452005e-06, "loss": 0.6124, "mean_token_accuracy": 0.8746581524610519, "num_tokens": 63324751.0, "step": 592 }, { "epoch": 1.3511972633979474, "grad_norm": 2.140625, "learning_rate": 4.90246082905718e-06, "loss": 0.617, "mean_token_accuracy": 0.8720570355653763, "num_tokens": 63431709.0, "step": 593 }, { "epoch": 1.3534777651083238, "grad_norm": 3.09375, "learning_rate": 4.90193918007197e-06, "loss": 0.6156, "mean_token_accuracy": 0.8742910176515579, "num_tokens": 63538728.0, "step": 594 }, { "epoch": 1.3557582668187, "grad_norm": 2.921875, "learning_rate": 4.901416167792452e-06, "loss": 0.6044, "mean_token_accuracy": 0.875694528222084, "num_tokens": 63645888.0, "step": 595 }, { "epoch": 1.3580387685290765, "grad_norm": 2.71875, "learning_rate": 4.9008917925154795e-06, "loss": 0.595, "mean_token_accuracy": 0.8773985803127289, "num_tokens": 63753292.0, "step": 596 }, { "epoch": 1.3603192702394526, "grad_norm": 2.046875, "learning_rate": 4.900366054538675e-06, "loss": 0.6149, "mean_token_accuracy": 0.8744669109582901, "num_tokens": 63859832.0, "step": 597 }, { "epoch": 1.3625997719498288, "grad_norm": 9.0625, "learning_rate": 4.8998389541604405e-06, "loss": 0.6178, "mean_token_accuracy": 0.8743558824062347, "num_tokens": 63966869.0, "step": 598 }, { "epoch": 1.3648802736602053, "grad_norm": 7.9375, "learning_rate": 4.899310491679945e-06, "loss": 0.5883, "mean_token_accuracy": 0.8777134567499161, "num_tokens": 64074013.0, "step": 599 }, { "epoch": 1.3671607753705814, "grad_norm": 3.015625, "learning_rate": 4.898780667397136e-06, "loss": 0.6098, "mean_token_accuracy": 0.8756420910358429, "num_tokens": 64181228.0, "step": 600 }, { "epoch": 1.3694412770809579, "grad_norm": 4.9375, "learning_rate": 4.89824948161273e-06, "loss": 0.5929, "mean_token_accuracy": 0.8778956681489944, "num_tokens": 64288189.0, "step": 601 }, { "epoch": 1.371721778791334, "grad_norm": 2.140625, "learning_rate": 4.8977169346282184e-06, "loss": 0.6101, "mean_token_accuracy": 0.8754762411117554, "num_tokens": 64395116.0, "step": 602 }, { "epoch": 1.3740022805017102, "grad_norm": 2.59375, "learning_rate": 4.8971830267458645e-06, "loss": 0.6076, "mean_token_accuracy": 0.8785329908132553, "num_tokens": 64502388.0, "step": 603 }, { "epoch": 1.3762827822120867, "grad_norm": 2.90625, "learning_rate": 4.896647758268703e-06, "loss": 0.6121, "mean_token_accuracy": 0.8748378753662109, "num_tokens": 64609541.0, "step": 604 }, { "epoch": 1.378563283922463, "grad_norm": 3.8125, "learning_rate": 4.8961111295005444e-06, "loss": 0.6057, "mean_token_accuracy": 0.8767747431993484, "num_tokens": 64716199.0, "step": 605 }, { "epoch": 1.3808437856328393, "grad_norm": 2.953125, "learning_rate": 4.895573140745967e-06, "loss": 0.6154, "mean_token_accuracy": 0.8689918667078018, "num_tokens": 64823123.0, "step": 606 }, { "epoch": 1.3831242873432155, "grad_norm": 1.9296875, "learning_rate": 4.895033792310323e-06, "loss": 0.5859, "mean_token_accuracy": 0.879554808139801, "num_tokens": 64930304.0, "step": 607 }, { "epoch": 1.3854047890535917, "grad_norm": 3.859375, "learning_rate": 4.894493084499736e-06, "loss": 0.6176, "mean_token_accuracy": 0.8728460669517517, "num_tokens": 65037508.0, "step": 608 }, { "epoch": 1.387685290763968, "grad_norm": 2.203125, "learning_rate": 4.893951017621103e-06, "loss": 0.5968, "mean_token_accuracy": 0.8793998062610626, "num_tokens": 65145188.0, "step": 609 }, { "epoch": 1.3899657924743445, "grad_norm": 6.59375, "learning_rate": 4.893407591982088e-06, "loss": 0.6289, "mean_token_accuracy": 0.8728020936250687, "num_tokens": 65251638.0, "step": 610 }, { "epoch": 1.3922462941847207, "grad_norm": 3.46875, "learning_rate": 4.892862807891131e-06, "loss": 0.6187, "mean_token_accuracy": 0.8728552013635635, "num_tokens": 65358821.0, "step": 611 }, { "epoch": 1.3945267958950969, "grad_norm": 2.203125, "learning_rate": 4.89231666565744e-06, "loss": 0.6356, "mean_token_accuracy": 0.8699388653039932, "num_tokens": 65466008.0, "step": 612 }, { "epoch": 1.3968072976054733, "grad_norm": 2.40625, "learning_rate": 4.891769165590995e-06, "loss": 0.6067, "mean_token_accuracy": 0.8760533332824707, "num_tokens": 65573422.0, "step": 613 }, { "epoch": 1.3990877993158495, "grad_norm": 2.578125, "learning_rate": 4.891220308002547e-06, "loss": 0.6115, "mean_token_accuracy": 0.8774368613958359, "num_tokens": 65679987.0, "step": 614 }, { "epoch": 1.401368301026226, "grad_norm": 2.84375, "learning_rate": 4.890670093203617e-06, "loss": 0.6178, "mean_token_accuracy": 0.8716036677360535, "num_tokens": 65786496.0, "step": 615 }, { "epoch": 1.403648802736602, "grad_norm": 5.25, "learning_rate": 4.890118521506494e-06, "loss": 0.6349, "mean_token_accuracy": 0.8668168038129807, "num_tokens": 65893146.0, "step": 616 }, { "epoch": 1.4059293044469783, "grad_norm": 3.25, "learning_rate": 4.889565593224242e-06, "loss": 0.6086, "mean_token_accuracy": 0.8759677559137344, "num_tokens": 65999804.0, "step": 617 }, { "epoch": 1.4082098061573547, "grad_norm": 7.4375, "learning_rate": 4.889011308670693e-06, "loss": 0.601, "mean_token_accuracy": 0.8765671104192734, "num_tokens": 66106566.0, "step": 618 }, { "epoch": 1.4104903078677309, "grad_norm": 3.328125, "learning_rate": 4.8884556681604445e-06, "loss": 0.6154, "mean_token_accuracy": 0.8747722804546356, "num_tokens": 66213934.0, "step": 619 }, { "epoch": 1.4127708095781073, "grad_norm": 4.34375, "learning_rate": 4.8878986720088715e-06, "loss": 0.6039, "mean_token_accuracy": 0.878739207983017, "num_tokens": 66320771.0, "step": 620 }, { "epoch": 1.4150513112884835, "grad_norm": 2.90625, "learning_rate": 4.8873403205321115e-06, "loss": 0.6327, "mean_token_accuracy": 0.8718791306018829, "num_tokens": 66428168.0, "step": 621 }, { "epoch": 1.4173318129988597, "grad_norm": 2.234375, "learning_rate": 4.886780614047075e-06, "loss": 0.5787, "mean_token_accuracy": 0.877121314406395, "num_tokens": 66535417.0, "step": 622 }, { "epoch": 1.419612314709236, "grad_norm": 4.90625, "learning_rate": 4.886219552871441e-06, "loss": 0.6183, "mean_token_accuracy": 0.8733110427856445, "num_tokens": 66642970.0, "step": 623 }, { "epoch": 1.4218928164196123, "grad_norm": 6.0, "learning_rate": 4.885657137323656e-06, "loss": 0.6033, "mean_token_accuracy": 0.8745451271533966, "num_tokens": 66750313.0, "step": 624 }, { "epoch": 1.4241733181299887, "grad_norm": 3.796875, "learning_rate": 4.885093367722937e-06, "loss": 0.5826, "mean_token_accuracy": 0.8829841762781143, "num_tokens": 66857610.0, "step": 625 }, { "epoch": 1.426453819840365, "grad_norm": 6.15625, "learning_rate": 4.884528244389269e-06, "loss": 0.6207, "mean_token_accuracy": 0.8738205432891846, "num_tokens": 66964325.0, "step": 626 }, { "epoch": 1.428734321550741, "grad_norm": 1.9921875, "learning_rate": 4.883961767643404e-06, "loss": 0.571, "mean_token_accuracy": 0.8796354234218597, "num_tokens": 67071449.0, "step": 627 }, { "epoch": 1.4310148232611175, "grad_norm": 4.125, "learning_rate": 4.883393937806864e-06, "loss": 0.5991, "mean_token_accuracy": 0.8746996223926544, "num_tokens": 67178773.0, "step": 628 }, { "epoch": 1.4332953249714937, "grad_norm": 6.21875, "learning_rate": 4.882824755201938e-06, "loss": 0.5954, "mean_token_accuracy": 0.8757723271846771, "num_tokens": 67286302.0, "step": 629 }, { "epoch": 1.4355758266818701, "grad_norm": 8.4375, "learning_rate": 4.8822542201516835e-06, "loss": 0.6061, "mean_token_accuracy": 0.8711700737476349, "num_tokens": 67393466.0, "step": 630 }, { "epoch": 1.4378563283922463, "grad_norm": 3.46875, "learning_rate": 4.881682332979925e-06, "loss": 0.6216, "mean_token_accuracy": 0.869665339589119, "num_tokens": 67501293.0, "step": 631 }, { "epoch": 1.4401368301026225, "grad_norm": 2.6875, "learning_rate": 4.881109094011254e-06, "loss": 0.5956, "mean_token_accuracy": 0.8787660300731659, "num_tokens": 67609069.0, "step": 632 }, { "epoch": 1.442417331812999, "grad_norm": 5.5, "learning_rate": 4.88053450357103e-06, "loss": 0.6089, "mean_token_accuracy": 0.8752091825008392, "num_tokens": 67716379.0, "step": 633 }, { "epoch": 1.444697833523375, "grad_norm": 4.6875, "learning_rate": 4.87995856198538e-06, "loss": 0.6164, "mean_token_accuracy": 0.8721603453159332, "num_tokens": 67823022.0, "step": 634 }, { "epoch": 1.4469783352337515, "grad_norm": 4.0625, "learning_rate": 4.879381269581197e-06, "loss": 0.6158, "mean_token_accuracy": 0.8731086701154709, "num_tokens": 67929811.0, "step": 635 }, { "epoch": 1.4492588369441277, "grad_norm": 2.015625, "learning_rate": 4.878802626686141e-06, "loss": 0.5772, "mean_token_accuracy": 0.8828457891941071, "num_tokens": 68036844.0, "step": 636 }, { "epoch": 1.451539338654504, "grad_norm": 2.9375, "learning_rate": 4.8782226336286395e-06, "loss": 0.6151, "mean_token_accuracy": 0.8716375380754471, "num_tokens": 68144018.0, "step": 637 }, { "epoch": 1.4538198403648803, "grad_norm": 2.46875, "learning_rate": 4.8776412907378845e-06, "loss": 0.5829, "mean_token_accuracy": 0.8821705430746078, "num_tokens": 68251176.0, "step": 638 }, { "epoch": 1.4561003420752565, "grad_norm": 2.375, "learning_rate": 4.877058598343835e-06, "loss": 0.6087, "mean_token_accuracy": 0.8753101974725723, "num_tokens": 68358878.0, "step": 639 }, { "epoch": 1.458380843785633, "grad_norm": 1.8515625, "learning_rate": 4.876474556777216e-06, "loss": 0.5801, "mean_token_accuracy": 0.8794443160295486, "num_tokens": 68466299.0, "step": 640 }, { "epoch": 1.4606613454960091, "grad_norm": 2.46875, "learning_rate": 4.8758891663695165e-06, "loss": 0.602, "mean_token_accuracy": 0.8779085278511047, "num_tokens": 68573383.0, "step": 641 }, { "epoch": 1.4629418472063853, "grad_norm": 2.3125, "learning_rate": 4.875302427452996e-06, "loss": 0.6235, "mean_token_accuracy": 0.8709381222724915, "num_tokens": 68680054.0, "step": 642 }, { "epoch": 1.4652223489167617, "grad_norm": 2.953125, "learning_rate": 4.874714340360674e-06, "loss": 0.605, "mean_token_accuracy": 0.8749048262834549, "num_tokens": 68787643.0, "step": 643 }, { "epoch": 1.467502850627138, "grad_norm": 2.453125, "learning_rate": 4.874124905426339e-06, "loss": 0.6054, "mean_token_accuracy": 0.8763098567724228, "num_tokens": 68894888.0, "step": 644 }, { "epoch": 1.4697833523375143, "grad_norm": 3.0, "learning_rate": 4.873534122984541e-06, "loss": 0.611, "mean_token_accuracy": 0.8762774467468262, "num_tokens": 69001443.0, "step": 645 }, { "epoch": 1.4720638540478905, "grad_norm": 2.125, "learning_rate": 4.872941993370598e-06, "loss": 0.5801, "mean_token_accuracy": 0.882948562502861, "num_tokens": 69108474.0, "step": 646 }, { "epoch": 1.4743443557582667, "grad_norm": 3.265625, "learning_rate": 4.872348516920591e-06, "loss": 0.6272, "mean_token_accuracy": 0.8692348450422287, "num_tokens": 69215286.0, "step": 647 }, { "epoch": 1.4766248574686431, "grad_norm": 1.890625, "learning_rate": 4.8717536939713665e-06, "loss": 0.5848, "mean_token_accuracy": 0.8783621490001678, "num_tokens": 69322791.0, "step": 648 }, { "epoch": 1.4789053591790193, "grad_norm": 4.75, "learning_rate": 4.871157524860533e-06, "loss": 0.603, "mean_token_accuracy": 0.8760869204998016, "num_tokens": 69429973.0, "step": 649 }, { "epoch": 1.4811858608893957, "grad_norm": 3.796875, "learning_rate": 4.870560009926465e-06, "loss": 0.6125, "mean_token_accuracy": 0.8765671104192734, "num_tokens": 69537546.0, "step": 650 }, { "epoch": 1.483466362599772, "grad_norm": 2.953125, "learning_rate": 4.869961149508301e-06, "loss": 0.6213, "mean_token_accuracy": 0.8740588128566742, "num_tokens": 69644078.0, "step": 651 }, { "epoch": 1.4857468643101481, "grad_norm": 4.3125, "learning_rate": 4.869360943945943e-06, "loss": 0.5857, "mean_token_accuracy": 0.8758237510919571, "num_tokens": 69750942.0, "step": 652 }, { "epoch": 1.4880273660205245, "grad_norm": 3.546875, "learning_rate": 4.868759393580054e-06, "loss": 0.6004, "mean_token_accuracy": 0.8765598982572556, "num_tokens": 69857823.0, "step": 653 }, { "epoch": 1.4903078677309007, "grad_norm": 2.1875, "learning_rate": 4.868156498752066e-06, "loss": 0.6149, "mean_token_accuracy": 0.8746924102306366, "num_tokens": 69964959.0, "step": 654 }, { "epoch": 1.4925883694412772, "grad_norm": 2.34375, "learning_rate": 4.8675522598041675e-06, "loss": 0.6092, "mean_token_accuracy": 0.8732979595661163, "num_tokens": 70071686.0, "step": 655 }, { "epoch": 1.4948688711516533, "grad_norm": 2.109375, "learning_rate": 4.866946677079314e-06, "loss": 0.5976, "mean_token_accuracy": 0.875095933675766, "num_tokens": 70179037.0, "step": 656 }, { "epoch": 1.4971493728620295, "grad_norm": 4.1875, "learning_rate": 4.866339750921222e-06, "loss": 0.6294, "mean_token_accuracy": 0.8688063323497772, "num_tokens": 70285939.0, "step": 657 }, { "epoch": 1.499429874572406, "grad_norm": 3.1875, "learning_rate": 4.86573148167437e-06, "loss": 0.6153, "mean_token_accuracy": 0.8754535466432571, "num_tokens": 70393254.0, "step": 658 }, { "epoch": 1.5017103762827824, "grad_norm": 5.34375, "learning_rate": 4.865121869684003e-06, "loss": 0.5896, "mean_token_accuracy": 0.8764495700597763, "num_tokens": 70500856.0, "step": 659 }, { "epoch": 1.5039908779931586, "grad_norm": 3.390625, "learning_rate": 4.864510915296122e-06, "loss": 0.597, "mean_token_accuracy": 0.8765180259943008, "num_tokens": 70608662.0, "step": 660 }, { "epoch": 1.5039908779931586, "eval_loss": 0.612367570400238, "eval_mean_token_accuracy": 0.8747105800153638, "eval_num_tokens": 70608662.0, "eval_runtime": 58.5955, "eval_samples_per_second": 143.1, "eval_steps_per_second": 4.488, "step": 660 }, { "epoch": 1.5062713797035348, "grad_norm": 2.109375, "learning_rate": 4.8638986188574955e-06, "loss": 0.5952, "mean_token_accuracy": 0.87701815366745, "num_tokens": 70715623.0, "step": 661 }, { "epoch": 1.508551881413911, "grad_norm": 3.53125, "learning_rate": 4.863284980715649e-06, "loss": 0.6125, "mean_token_accuracy": 0.8763703256845474, "num_tokens": 70822494.0, "step": 662 }, { "epoch": 1.5108323831242874, "grad_norm": 2.265625, "learning_rate": 4.8626700012188724e-06, "loss": 0.6303, "mean_token_accuracy": 0.8716311007738113, "num_tokens": 70929521.0, "step": 663 }, { "epoch": 1.5131128848346638, "grad_norm": 3.171875, "learning_rate": 4.8620536807162164e-06, "loss": 0.6084, "mean_token_accuracy": 0.8747840225696564, "num_tokens": 71036264.0, "step": 664 }, { "epoch": 1.51539338654504, "grad_norm": 2.328125, "learning_rate": 4.861436019557492e-06, "loss": 0.6227, "mean_token_accuracy": 0.8713407516479492, "num_tokens": 71142930.0, "step": 665 }, { "epoch": 1.5176738882554162, "grad_norm": 2.6875, "learning_rate": 4.8608170180932725e-06, "loss": 0.6033, "mean_token_accuracy": 0.8776234537363052, "num_tokens": 71249846.0, "step": 666 }, { "epoch": 1.5199543899657924, "grad_norm": 2.203125, "learning_rate": 4.860196676674891e-06, "loss": 0.5998, "mean_token_accuracy": 0.8785290718078613, "num_tokens": 71356611.0, "step": 667 }, { "epoch": 1.5222348916761688, "grad_norm": 2.5625, "learning_rate": 4.8595749956544414e-06, "loss": 0.6036, "mean_token_accuracy": 0.8775981813669205, "num_tokens": 71463513.0, "step": 668 }, { "epoch": 1.5245153933865452, "grad_norm": 3.171875, "learning_rate": 4.858951975384777e-06, "loss": 0.6166, "mean_token_accuracy": 0.874157264828682, "num_tokens": 71570255.0, "step": 669 }, { "epoch": 1.5267958950969214, "grad_norm": 5.875, "learning_rate": 4.858327616219513e-06, "loss": 0.5885, "mean_token_accuracy": 0.8791629523038864, "num_tokens": 71677310.0, "step": 670 }, { "epoch": 1.5290763968072976, "grad_norm": 5.15625, "learning_rate": 4.857701918513023e-06, "loss": 0.6137, "mean_token_accuracy": 0.8735476732254028, "num_tokens": 71783915.0, "step": 671 }, { "epoch": 1.5313568985176738, "grad_norm": 3.6875, "learning_rate": 4.857074882620442e-06, "loss": 0.6026, "mean_token_accuracy": 0.8756736516952515, "num_tokens": 71891495.0, "step": 672 }, { "epoch": 1.5336374002280502, "grad_norm": 2.453125, "learning_rate": 4.856446508897662e-06, "loss": 0.5949, "mean_token_accuracy": 0.8775335848331451, "num_tokens": 71999208.0, "step": 673 }, { "epoch": 1.5359179019384266, "grad_norm": 8.1875, "learning_rate": 4.8558167977013365e-06, "loss": 0.6152, "mean_token_accuracy": 0.8727478533983231, "num_tokens": 72106904.0, "step": 674 }, { "epoch": 1.5381984036488028, "grad_norm": 6.59375, "learning_rate": 4.8551857493888775e-06, "loss": 0.5947, "mean_token_accuracy": 0.8790593892335892, "num_tokens": 72213551.0, "step": 675 }, { "epoch": 1.540478905359179, "grad_norm": 6.1875, "learning_rate": 4.854553364318456e-06, "loss": 0.6048, "mean_token_accuracy": 0.8734369874000549, "num_tokens": 72320910.0, "step": 676 }, { "epoch": 1.5427594070695552, "grad_norm": 2.53125, "learning_rate": 4.8539196428490016e-06, "loss": 0.6387, "mean_token_accuracy": 0.8696837574243546, "num_tokens": 72427539.0, "step": 677 }, { "epoch": 1.5450399087799316, "grad_norm": 4.84375, "learning_rate": 4.8532845853402015e-06, "loss": 0.622, "mean_token_accuracy": 0.8762244433164597, "num_tokens": 72534311.0, "step": 678 }, { "epoch": 1.547320410490308, "grad_norm": 3.765625, "learning_rate": 4.8526481921525035e-06, "loss": 0.615, "mean_token_accuracy": 0.872207522392273, "num_tokens": 72641361.0, "step": 679 }, { "epoch": 1.5496009122006842, "grad_norm": 3.625, "learning_rate": 4.85201046364711e-06, "loss": 0.6082, "mean_token_accuracy": 0.8779794573783875, "num_tokens": 72748703.0, "step": 680 }, { "epoch": 1.5518814139110604, "grad_norm": 2.65625, "learning_rate": 4.851371400185986e-06, "loss": 0.6091, "mean_token_accuracy": 0.8768866658210754, "num_tokens": 72855762.0, "step": 681 }, { "epoch": 1.5541619156214366, "grad_norm": 2.40625, "learning_rate": 4.85073100213185e-06, "loss": 0.6063, "mean_token_accuracy": 0.8712277412414551, "num_tokens": 72962645.0, "step": 682 }, { "epoch": 1.556442417331813, "grad_norm": 4.34375, "learning_rate": 4.8500892698481784e-06, "loss": 0.6018, "mean_token_accuracy": 0.8731422871351242, "num_tokens": 73069760.0, "step": 683 }, { "epoch": 1.5587229190421894, "grad_norm": 3.609375, "learning_rate": 4.849446203699209e-06, "loss": 0.6125, "mean_token_accuracy": 0.8730466067790985, "num_tokens": 73176865.0, "step": 684 }, { "epoch": 1.5610034207525656, "grad_norm": 2.375, "learning_rate": 4.848801804049932e-06, "loss": 0.6069, "mean_token_accuracy": 0.8776687532663345, "num_tokens": 73284011.0, "step": 685 }, { "epoch": 1.5632839224629418, "grad_norm": 3.9375, "learning_rate": 4.848156071266095e-06, "loss": 0.5939, "mean_token_accuracy": 0.8770481199026108, "num_tokens": 73390880.0, "step": 686 }, { "epoch": 1.565564424173318, "grad_norm": 2.78125, "learning_rate": 4.847509005714207e-06, "loss": 0.6151, "mean_token_accuracy": 0.8751052618026733, "num_tokens": 73497575.0, "step": 687 }, { "epoch": 1.5678449258836944, "grad_norm": 4.71875, "learning_rate": 4.846860607761527e-06, "loss": 0.6022, "mean_token_accuracy": 0.8736310601234436, "num_tokens": 73604375.0, "step": 688 }, { "epoch": 1.5701254275940708, "grad_norm": 2.171875, "learning_rate": 4.8462108777760734e-06, "loss": 0.6045, "mean_token_accuracy": 0.873234361410141, "num_tokens": 73712100.0, "step": 689 }, { "epoch": 1.572405929304447, "grad_norm": 2.453125, "learning_rate": 4.845559816126622e-06, "loss": 0.6198, "mean_token_accuracy": 0.8704589009284973, "num_tokens": 73819126.0, "step": 690 }, { "epoch": 1.5746864310148232, "grad_norm": 4.25, "learning_rate": 4.844907423182699e-06, "loss": 0.6048, "mean_token_accuracy": 0.8772129714488983, "num_tokens": 73925905.0, "step": 691 }, { "epoch": 1.5769669327251994, "grad_norm": 3.90625, "learning_rate": 4.844253699314596e-06, "loss": 0.5999, "mean_token_accuracy": 0.875739112496376, "num_tokens": 74033596.0, "step": 692 }, { "epoch": 1.5792474344355758, "grad_norm": 2.453125, "learning_rate": 4.843598644893349e-06, "loss": 0.5988, "mean_token_accuracy": 0.8757044672966003, "num_tokens": 74140480.0, "step": 693 }, { "epoch": 1.5815279361459522, "grad_norm": 3.828125, "learning_rate": 4.842942260290757e-06, "loss": 0.6247, "mean_token_accuracy": 0.8707884252071381, "num_tokens": 74247691.0, "step": 694 }, { "epoch": 1.5838084378563284, "grad_norm": 2.703125, "learning_rate": 4.84228454587937e-06, "loss": 0.6162, "mean_token_accuracy": 0.8712183386087418, "num_tokens": 74354568.0, "step": 695 }, { "epoch": 1.5860889395667046, "grad_norm": 3.328125, "learning_rate": 4.841625502032495e-06, "loss": 0.5966, "mean_token_accuracy": 0.8787146657705307, "num_tokens": 74461346.0, "step": 696 }, { "epoch": 1.5883694412770808, "grad_norm": 3.15625, "learning_rate": 4.84096512912419e-06, "loss": 0.6141, "mean_token_accuracy": 0.8755437433719635, "num_tokens": 74567912.0, "step": 697 }, { "epoch": 1.5906499429874572, "grad_norm": 2.21875, "learning_rate": 4.8403034275292735e-06, "loss": 0.6005, "mean_token_accuracy": 0.8786727637052536, "num_tokens": 74674714.0, "step": 698 }, { "epoch": 1.5929304446978336, "grad_norm": 2.984375, "learning_rate": 4.839640397623312e-06, "loss": 0.604, "mean_token_accuracy": 0.8754930794239044, "num_tokens": 74781656.0, "step": 699 }, { "epoch": 1.5952109464082098, "grad_norm": 4.125, "learning_rate": 4.83897603978263e-06, "loss": 0.6078, "mean_token_accuracy": 0.8756787329912186, "num_tokens": 74888714.0, "step": 700 }, { "epoch": 1.597491448118586, "grad_norm": 4.03125, "learning_rate": 4.838310354384304e-06, "loss": 0.6248, "mean_token_accuracy": 0.8733028322458267, "num_tokens": 74995535.0, "step": 701 }, { "epoch": 1.5997719498289624, "grad_norm": 2.171875, "learning_rate": 4.8376433418061615e-06, "loss": 0.6095, "mean_token_accuracy": 0.8739898204803467, "num_tokens": 75102558.0, "step": 702 }, { "epoch": 1.6020524515393386, "grad_norm": 4.6875, "learning_rate": 4.8369750024267904e-06, "loss": 0.6103, "mean_token_accuracy": 0.8737581223249435, "num_tokens": 75209920.0, "step": 703 }, { "epoch": 1.604332953249715, "grad_norm": 4.03125, "learning_rate": 4.836305336625523e-06, "loss": 0.6218, "mean_token_accuracy": 0.8702377676963806, "num_tokens": 75316855.0, "step": 704 }, { "epoch": 1.6066134549600912, "grad_norm": 4.84375, "learning_rate": 4.835634344782453e-06, "loss": 0.6076, "mean_token_accuracy": 0.8709095418453217, "num_tokens": 75423929.0, "step": 705 }, { "epoch": 1.6088939566704674, "grad_norm": 2.53125, "learning_rate": 4.834962027278418e-06, "loss": 0.6074, "mean_token_accuracy": 0.873084306716919, "num_tokens": 75531031.0, "step": 706 }, { "epoch": 1.6111744583808438, "grad_norm": 2.28125, "learning_rate": 4.834288384495015e-06, "loss": 0.6219, "mean_token_accuracy": 0.8735803365707397, "num_tokens": 75637818.0, "step": 707 }, { "epoch": 1.61345496009122, "grad_norm": 2.953125, "learning_rate": 4.833613416814591e-06, "loss": 0.5976, "mean_token_accuracy": 0.8781051784753799, "num_tokens": 75744879.0, "step": 708 }, { "epoch": 1.6157354618015964, "grad_norm": 2.421875, "learning_rate": 4.832937124620243e-06, "loss": 0.5884, "mean_token_accuracy": 0.8801741451025009, "num_tokens": 75852406.0, "step": 709 }, { "epoch": 1.6180159635119726, "grad_norm": 5.84375, "learning_rate": 4.832259508295822e-06, "loss": 0.5971, "mean_token_accuracy": 0.8746786862611771, "num_tokens": 75959416.0, "step": 710 }, { "epoch": 1.6202964652223488, "grad_norm": 4.125, "learning_rate": 4.831580568225931e-06, "loss": 0.6059, "mean_token_accuracy": 0.8775349110364914, "num_tokens": 76067055.0, "step": 711 }, { "epoch": 1.6225769669327252, "grad_norm": 2.6875, "learning_rate": 4.830900304795921e-06, "loss": 0.5873, "mean_token_accuracy": 0.8796486407518387, "num_tokens": 76174493.0, "step": 712 }, { "epoch": 1.6248574686431014, "grad_norm": 2.546875, "learning_rate": 4.8302187183918996e-06, "loss": 0.6092, "mean_token_accuracy": 0.8721607774496078, "num_tokens": 76281621.0, "step": 713 }, { "epoch": 1.6271379703534778, "grad_norm": 2.546875, "learning_rate": 4.8295358094007184e-06, "loss": 0.6131, "mean_token_accuracy": 0.8758230358362198, "num_tokens": 76387782.0, "step": 714 }, { "epoch": 1.629418472063854, "grad_norm": 3.328125, "learning_rate": 4.828851578209986e-06, "loss": 0.5849, "mean_token_accuracy": 0.8804755955934525, "num_tokens": 76495248.0, "step": 715 }, { "epoch": 1.6316989737742302, "grad_norm": 2.5625, "learning_rate": 4.828166025208059e-06, "loss": 0.5963, "mean_token_accuracy": 0.8788855522871017, "num_tokens": 76602322.0, "step": 716 }, { "epoch": 1.6339794754846066, "grad_norm": 3.5, "learning_rate": 4.8274791507840416e-06, "loss": 0.6142, "mean_token_accuracy": 0.876452699303627, "num_tokens": 76709429.0, "step": 717 }, { "epoch": 1.636259977194983, "grad_norm": 2.109375, "learning_rate": 4.826790955327793e-06, "loss": 0.6182, "mean_token_accuracy": 0.8712098151445389, "num_tokens": 76817050.0, "step": 718 }, { "epoch": 1.6385404789053593, "grad_norm": 2.3125, "learning_rate": 4.826101439229918e-06, "loss": 0.6144, "mean_token_accuracy": 0.8716437220573425, "num_tokens": 76923670.0, "step": 719 }, { "epoch": 1.6408209806157354, "grad_norm": 5.0, "learning_rate": 4.825410602881774e-06, "loss": 0.6147, "mean_token_accuracy": 0.8725506961345673, "num_tokens": 77030550.0, "step": 720 }, { "epoch": 1.6431014823261116, "grad_norm": 4.25, "learning_rate": 4.824718446675465e-06, "loss": 0.6036, "mean_token_accuracy": 0.8758846968412399, "num_tokens": 77137310.0, "step": 721 }, { "epoch": 1.645381984036488, "grad_norm": 2.84375, "learning_rate": 4.8240249710038455e-06, "loss": 0.5956, "mean_token_accuracy": 0.8754921108484268, "num_tokens": 77244391.0, "step": 722 }, { "epoch": 1.6476624857468645, "grad_norm": 5.40625, "learning_rate": 4.82333017626052e-06, "loss": 0.609, "mean_token_accuracy": 0.8759109824895859, "num_tokens": 77350980.0, "step": 723 }, { "epoch": 1.6499429874572407, "grad_norm": 7.5, "learning_rate": 4.82263406283984e-06, "loss": 0.6217, "mean_token_accuracy": 0.8713529407978058, "num_tokens": 77457863.0, "step": 724 }, { "epoch": 1.6522234891676169, "grad_norm": 5.15625, "learning_rate": 4.821936631136907e-06, "loss": 0.612, "mean_token_accuracy": 0.8779774755239487, "num_tokens": 77564956.0, "step": 725 }, { "epoch": 1.654503990877993, "grad_norm": 2.515625, "learning_rate": 4.821237881547567e-06, "loss": 0.583, "mean_token_accuracy": 0.8784075975418091, "num_tokens": 77672980.0, "step": 726 }, { "epoch": 1.6567844925883695, "grad_norm": 2.234375, "learning_rate": 4.82053781446842e-06, "loss": 0.5857, "mean_token_accuracy": 0.8801317662000656, "num_tokens": 77779982.0, "step": 727 }, { "epoch": 1.6590649942987459, "grad_norm": 2.140625, "learning_rate": 4.819836430296809e-06, "loss": 0.6133, "mean_token_accuracy": 0.8747014552354813, "num_tokens": 77886824.0, "step": 728 }, { "epoch": 1.661345496009122, "grad_norm": 3.421875, "learning_rate": 4.819133729430826e-06, "loss": 0.606, "mean_token_accuracy": 0.874286100268364, "num_tokens": 77993947.0, "step": 729 }, { "epoch": 1.6636259977194983, "grad_norm": 3.46875, "learning_rate": 4.818429712269312e-06, "loss": 0.5977, "mean_token_accuracy": 0.8773009479045868, "num_tokens": 78101464.0, "step": 730 }, { "epoch": 1.6659064994298745, "grad_norm": 4.34375, "learning_rate": 4.8177243792118515e-06, "loss": 0.5972, "mean_token_accuracy": 0.8781311810016632, "num_tokens": 78208412.0, "step": 731 }, { "epoch": 1.6681870011402509, "grad_norm": 5.03125, "learning_rate": 4.8170177306587785e-06, "loss": 0.5947, "mean_token_accuracy": 0.8763359040021896, "num_tokens": 78316677.0, "step": 732 }, { "epoch": 1.6704675028506273, "grad_norm": 3.421875, "learning_rate": 4.8163097670111735e-06, "loss": 0.6005, "mean_token_accuracy": 0.8768177330493927, "num_tokens": 78424519.0, "step": 733 }, { "epoch": 1.6727480045610035, "grad_norm": 5.1875, "learning_rate": 4.815600488670863e-06, "loss": 0.6127, "mean_token_accuracy": 0.8756203055381775, "num_tokens": 78531596.0, "step": 734 }, { "epoch": 1.6750285062713797, "grad_norm": 2.109375, "learning_rate": 4.81488989604042e-06, "loss": 0.5895, "mean_token_accuracy": 0.8781074434518814, "num_tokens": 78638185.0, "step": 735 }, { "epoch": 1.6773090079817559, "grad_norm": 3.3125, "learning_rate": 4.814177989523162e-06, "loss": 0.5993, "mean_token_accuracy": 0.8762544244527817, "num_tokens": 78745981.0, "step": 736 }, { "epoch": 1.6795895096921323, "grad_norm": 4.15625, "learning_rate": 4.813464769523154e-06, "loss": 0.6209, "mean_token_accuracy": 0.8740936666727066, "num_tokens": 78853266.0, "step": 737 }, { "epoch": 1.6818700114025087, "grad_norm": 2.53125, "learning_rate": 4.812750236445206e-06, "loss": 0.6025, "mean_token_accuracy": 0.8746587634086609, "num_tokens": 78960887.0, "step": 738 }, { "epoch": 1.6841505131128849, "grad_norm": 2.9375, "learning_rate": 4.812034390694874e-06, "loss": 0.6203, "mean_token_accuracy": 0.871962770819664, "num_tokens": 79068181.0, "step": 739 }, { "epoch": 1.686431014823261, "grad_norm": 2.625, "learning_rate": 4.811317232678456e-06, "loss": 0.5949, "mean_token_accuracy": 0.8781395107507706, "num_tokens": 79175196.0, "step": 740 }, { "epoch": 1.6887115165336373, "grad_norm": 3.703125, "learning_rate": 4.810598762803e-06, "loss": 0.6016, "mean_token_accuracy": 0.8768087178468704, "num_tokens": 79282678.0, "step": 741 }, { "epoch": 1.6909920182440137, "grad_norm": 4.65625, "learning_rate": 4.809878981476293e-06, "loss": 0.6145, "mean_token_accuracy": 0.8780290633440018, "num_tokens": 79389475.0, "step": 742 }, { "epoch": 1.69327251995439, "grad_norm": 3.5, "learning_rate": 4.80915788910687e-06, "loss": 0.6114, "mean_token_accuracy": 0.8754473924636841, "num_tokens": 79496094.0, "step": 743 }, { "epoch": 1.6955530216647663, "grad_norm": 4.125, "learning_rate": 4.80843548610401e-06, "loss": 0.5918, "mean_token_accuracy": 0.8804915100336075, "num_tokens": 79603317.0, "step": 744 }, { "epoch": 1.6978335233751425, "grad_norm": 2.28125, "learning_rate": 4.807711772877733e-06, "loss": 0.6007, "mean_token_accuracy": 0.875912070274353, "num_tokens": 79709931.0, "step": 745 }, { "epoch": 1.7001140250855187, "grad_norm": 2.828125, "learning_rate": 4.8069867498388066e-06, "loss": 0.6093, "mean_token_accuracy": 0.8721041977405548, "num_tokens": 79816476.0, "step": 746 }, { "epoch": 1.702394526795895, "grad_norm": 2.359375, "learning_rate": 4.806260417398739e-06, "loss": 0.6006, "mean_token_accuracy": 0.8775873631238937, "num_tokens": 79923080.0, "step": 747 }, { "epoch": 1.7046750285062715, "grad_norm": 2.46875, "learning_rate": 4.805532775969783e-06, "loss": 0.607, "mean_token_accuracy": 0.8796124309301376, "num_tokens": 80030054.0, "step": 748 }, { "epoch": 1.7069555302166477, "grad_norm": 2.1875, "learning_rate": 4.804803825964933e-06, "loss": 0.6043, "mean_token_accuracy": 0.8757105469703674, "num_tokens": 80137000.0, "step": 749 }, { "epoch": 1.709236031927024, "grad_norm": 4.9375, "learning_rate": 4.804073567797928e-06, "loss": 0.6095, "mean_token_accuracy": 0.874399796128273, "num_tokens": 80244175.0, "step": 750 }, { "epoch": 1.7115165336374, "grad_norm": 2.59375, "learning_rate": 4.803342001883247e-06, "loss": 0.6234, "mean_token_accuracy": 0.8730274736881256, "num_tokens": 80351450.0, "step": 751 }, { "epoch": 1.7137970353477765, "grad_norm": 3.234375, "learning_rate": 4.802609128636113e-06, "loss": 0.609, "mean_token_accuracy": 0.8778874278068542, "num_tokens": 80458377.0, "step": 752 }, { "epoch": 1.716077537058153, "grad_norm": 4.8125, "learning_rate": 4.801874948472492e-06, "loss": 0.6195, "mean_token_accuracy": 0.8746256977319717, "num_tokens": 80565320.0, "step": 753 }, { "epoch": 1.718358038768529, "grad_norm": 3.359375, "learning_rate": 4.801139461809089e-06, "loss": 0.5944, "mean_token_accuracy": 0.8773505538702011, "num_tokens": 80672618.0, "step": 754 }, { "epoch": 1.7206385404789053, "grad_norm": 4.34375, "learning_rate": 4.800402669063353e-06, "loss": 0.6376, "mean_token_accuracy": 0.8712969422340393, "num_tokens": 80779049.0, "step": 755 }, { "epoch": 1.7229190421892815, "grad_norm": 2.890625, "learning_rate": 4.799664570653473e-06, "loss": 0.6028, "mean_token_accuracy": 0.8769599795341492, "num_tokens": 80886174.0, "step": 756 }, { "epoch": 1.725199543899658, "grad_norm": 4.53125, "learning_rate": 4.79892516699838e-06, "loss": 0.5913, "mean_token_accuracy": 0.8780887126922607, "num_tokens": 80993329.0, "step": 757 }, { "epoch": 1.7274800456100343, "grad_norm": 6.96875, "learning_rate": 4.798184458517745e-06, "loss": 0.5998, "mean_token_accuracy": 0.8770759999752045, "num_tokens": 81101103.0, "step": 758 }, { "epoch": 1.7297605473204105, "grad_norm": 6.6875, "learning_rate": 4.797442445631978e-06, "loss": 0.5998, "mean_token_accuracy": 0.8771584331989288, "num_tokens": 81208123.0, "step": 759 }, { "epoch": 1.7320410490307867, "grad_norm": 3.796875, "learning_rate": 4.7966991287622335e-06, "loss": 0.6297, "mean_token_accuracy": 0.8718820959329605, "num_tokens": 81315003.0, "step": 760 }, { "epoch": 1.734321550741163, "grad_norm": 8.0625, "learning_rate": 4.795954508330403e-06, "loss": 0.6039, "mean_token_accuracy": 0.8779590427875519, "num_tokens": 81421672.0, "step": 761 }, { "epoch": 1.7366020524515393, "grad_norm": 11.4375, "learning_rate": 4.795208584759119e-06, "loss": 0.6015, "mean_token_accuracy": 0.8754216283559799, "num_tokens": 81528963.0, "step": 762 }, { "epoch": 1.7388825541619157, "grad_norm": 7.625, "learning_rate": 4.794461358471753e-06, "loss": 0.585, "mean_token_accuracy": 0.8796775341033936, "num_tokens": 81636332.0, "step": 763 }, { "epoch": 1.741163055872292, "grad_norm": 9.5625, "learning_rate": 4.7937128298924155e-06, "loss": 0.6187, "mean_token_accuracy": 0.8761927634477615, "num_tokens": 81743814.0, "step": 764 }, { "epoch": 1.7434435575826681, "grad_norm": 5.3125, "learning_rate": 4.7929629994459584e-06, "loss": 0.5587, "mean_token_accuracy": 0.8852293342351913, "num_tokens": 81851249.0, "step": 765 }, { "epoch": 1.7457240592930443, "grad_norm": 4.1875, "learning_rate": 4.792211867557969e-06, "loss": 0.5976, "mean_token_accuracy": 0.8791010826826096, "num_tokens": 81958684.0, "step": 766 }, { "epoch": 1.7480045610034207, "grad_norm": 5.625, "learning_rate": 4.7914594346547774e-06, "loss": 0.5968, "mean_token_accuracy": 0.8784136772155762, "num_tokens": 82065522.0, "step": 767 }, { "epoch": 1.7502850627137971, "grad_norm": 5.75, "learning_rate": 4.790705701163449e-06, "loss": 0.6138, "mean_token_accuracy": 0.8732837587594986, "num_tokens": 82172365.0, "step": 768 }, { "epoch": 1.7525655644241733, "grad_norm": 4.6875, "learning_rate": 4.789950667511789e-06, "loss": 0.6063, "mean_token_accuracy": 0.87830650806427, "num_tokens": 82279622.0, "step": 769 }, { "epoch": 1.7548460661345495, "grad_norm": 3.203125, "learning_rate": 4.789194334128338e-06, "loss": 0.6031, "mean_token_accuracy": 0.877634271979332, "num_tokens": 82386626.0, "step": 770 }, { "epoch": 1.757126567844926, "grad_norm": 2.46875, "learning_rate": 4.788436701442378e-06, "loss": 0.6191, "mean_token_accuracy": 0.8743524998426437, "num_tokens": 82494070.0, "step": 771 }, { "epoch": 1.7594070695553021, "grad_norm": 4.9375, "learning_rate": 4.787677769883926e-06, "loss": 0.6018, "mean_token_accuracy": 0.8753460049629211, "num_tokens": 82601259.0, "step": 772 }, { "epoch": 1.7616875712656785, "grad_norm": 4.8125, "learning_rate": 4.786917539883738e-06, "loss": 0.5842, "mean_token_accuracy": 0.8806093335151672, "num_tokens": 82708412.0, "step": 773 }, { "epoch": 1.7639680729760547, "grad_norm": 5.0625, "learning_rate": 4.786156011873304e-06, "loss": 0.6122, "mean_token_accuracy": 0.8751035928726196, "num_tokens": 82815320.0, "step": 774 }, { "epoch": 1.766248574686431, "grad_norm": 2.875, "learning_rate": 4.785393186284854e-06, "loss": 0.5978, "mean_token_accuracy": 0.8772329390048981, "num_tokens": 82922009.0, "step": 775 }, { "epoch": 1.7685290763968073, "grad_norm": 2.046875, "learning_rate": 4.784629063551354e-06, "loss": 0.5729, "mean_token_accuracy": 0.8803988993167877, "num_tokens": 83029138.0, "step": 776 }, { "epoch": 1.7708095781071835, "grad_norm": 2.890625, "learning_rate": 4.783863644106502e-06, "loss": 0.5954, "mean_token_accuracy": 0.8786198645830154, "num_tokens": 83136422.0, "step": 777 }, { "epoch": 1.77309007981756, "grad_norm": 2.640625, "learning_rate": 4.783096928384739e-06, "loss": 0.6054, "mean_token_accuracy": 0.8743276447057724, "num_tokens": 83243905.0, "step": 778 }, { "epoch": 1.7753705815279361, "grad_norm": 4.3125, "learning_rate": 4.782328916821235e-06, "loss": 0.6018, "mean_token_accuracy": 0.8728191703557968, "num_tokens": 83350681.0, "step": 779 }, { "epoch": 1.7776510832383123, "grad_norm": 2.984375, "learning_rate": 4.7815596098519004e-06, "loss": 0.6099, "mean_token_accuracy": 0.873179629445076, "num_tokens": 83457754.0, "step": 780 }, { "epoch": 1.7799315849486887, "grad_norm": 2.1875, "learning_rate": 4.780789007913379e-06, "loss": 0.6103, "mean_token_accuracy": 0.8735631853342056, "num_tokens": 83565053.0, "step": 781 }, { "epoch": 1.782212086659065, "grad_norm": 2.28125, "learning_rate": 4.780017111443048e-06, "loss": 0.5854, "mean_token_accuracy": 0.8767279386520386, "num_tokens": 83672036.0, "step": 782 }, { "epoch": 1.7844925883694414, "grad_norm": 4.25, "learning_rate": 4.779243920879023e-06, "loss": 0.6206, "mean_token_accuracy": 0.8717406392097473, "num_tokens": 83778589.0, "step": 783 }, { "epoch": 1.7867730900798175, "grad_norm": 2.765625, "learning_rate": 4.77846943666015e-06, "loss": 0.5772, "mean_token_accuracy": 0.8808600008487701, "num_tokens": 83886086.0, "step": 784 }, { "epoch": 1.7890535917901937, "grad_norm": 2.5, "learning_rate": 4.777693659226013e-06, "loss": 0.612, "mean_token_accuracy": 0.8717308193445206, "num_tokens": 83993388.0, "step": 785 }, { "epoch": 1.7913340935005702, "grad_norm": 2.40625, "learning_rate": 4.776916589016928e-06, "loss": 0.612, "mean_token_accuracy": 0.874013751745224, "num_tokens": 84100238.0, "step": 786 }, { "epoch": 1.7936145952109466, "grad_norm": 2.90625, "learning_rate": 4.776138226473944e-06, "loss": 0.6069, "mean_token_accuracy": 0.8767507672309875, "num_tokens": 84206817.0, "step": 787 }, { "epoch": 1.7958950969213228, "grad_norm": 3.109375, "learning_rate": 4.775358572038845e-06, "loss": 0.6163, "mean_token_accuracy": 0.8742737025022507, "num_tokens": 84314065.0, "step": 788 }, { "epoch": 1.798175598631699, "grad_norm": 2.046875, "learning_rate": 4.774577626154148e-06, "loss": 0.6074, "mean_token_accuracy": 0.873917818069458, "num_tokens": 84421307.0, "step": 789 }, { "epoch": 1.8004561003420751, "grad_norm": 2.65625, "learning_rate": 4.773795389263104e-06, "loss": 0.598, "mean_token_accuracy": 0.8776136040687561, "num_tokens": 84528605.0, "step": 790 }, { "epoch": 1.8027366020524516, "grad_norm": 2.34375, "learning_rate": 4.773011861809694e-06, "loss": 0.6154, "mean_token_accuracy": 0.8759381324052811, "num_tokens": 84635657.0, "step": 791 }, { "epoch": 1.805017103762828, "grad_norm": 4.3125, "learning_rate": 4.772227044238632e-06, "loss": 0.6173, "mean_token_accuracy": 0.8727094084024429, "num_tokens": 84742929.0, "step": 792 }, { "epoch": 1.8072976054732042, "grad_norm": 3.375, "learning_rate": 4.771440936995367e-06, "loss": 0.5963, "mean_token_accuracy": 0.876227617263794, "num_tokens": 84849828.0, "step": 793 }, { "epoch": 1.8095781071835804, "grad_norm": 2.671875, "learning_rate": 4.770653540526079e-06, "loss": 0.6116, "mean_token_accuracy": 0.8727654963731766, "num_tokens": 84956976.0, "step": 794 }, { "epoch": 1.8118586088939566, "grad_norm": 2.390625, "learning_rate": 4.7698648552776785e-06, "loss": 0.6202, "mean_token_accuracy": 0.8722630441188812, "num_tokens": 85063714.0, "step": 795 }, { "epoch": 1.814139110604333, "grad_norm": 6.9375, "learning_rate": 4.769074881697806e-06, "loss": 0.5742, "mean_token_accuracy": 0.883360892534256, "num_tokens": 85171018.0, "step": 796 }, { "epoch": 1.8164196123147094, "grad_norm": 8.1875, "learning_rate": 4.768283620234838e-06, "loss": 0.5899, "mean_token_accuracy": 0.8781389743089676, "num_tokens": 85278037.0, "step": 797 }, { "epoch": 1.8187001140250856, "grad_norm": 7.28125, "learning_rate": 4.767491071337877e-06, "loss": 0.6095, "mean_token_accuracy": 0.8743828237056732, "num_tokens": 85385317.0, "step": 798 }, { "epoch": 1.8209806157354618, "grad_norm": 3.46875, "learning_rate": 4.766697235456761e-06, "loss": 0.5999, "mean_token_accuracy": 0.8782155364751816, "num_tokens": 85493351.0, "step": 799 }, { "epoch": 1.823261117445838, "grad_norm": 4.3125, "learning_rate": 4.765902113042053e-06, "loss": 0.6179, "mean_token_accuracy": 0.8723445981740952, "num_tokens": 85599811.0, "step": 800 }, { "epoch": 1.8255416191562144, "grad_norm": 4.875, "learning_rate": 4.765105704545052e-06, "loss": 0.5962, "mean_token_accuracy": 0.8751823008060455, "num_tokens": 85706831.0, "step": 801 }, { "epoch": 1.8278221208665908, "grad_norm": 4.5625, "learning_rate": 4.7643080104177815e-06, "loss": 0.611, "mean_token_accuracy": 0.8715473413467407, "num_tokens": 85813856.0, "step": 802 }, { "epoch": 1.830102622576967, "grad_norm": 4.125, "learning_rate": 4.763509031113e-06, "loss": 0.6133, "mean_token_accuracy": 0.8734237253665924, "num_tokens": 85921277.0, "step": 803 }, { "epoch": 1.8323831242873432, "grad_norm": 2.5, "learning_rate": 4.7627087670841894e-06, "loss": 0.6146, "mean_token_accuracy": 0.8760107606649399, "num_tokens": 86028244.0, "step": 804 }, { "epoch": 1.8346636259977194, "grad_norm": 3.5625, "learning_rate": 4.761907218785566e-06, "loss": 0.6267, "mean_token_accuracy": 0.8721684813499451, "num_tokens": 86134885.0, "step": 805 }, { "epoch": 1.8369441277080958, "grad_norm": 2.21875, "learning_rate": 4.761104386672074e-06, "loss": 0.5806, "mean_token_accuracy": 0.8805957436561584, "num_tokens": 86242405.0, "step": 806 }, { "epoch": 1.8392246294184722, "grad_norm": 3.40625, "learning_rate": 4.760300271199384e-06, "loss": 0.6013, "mean_token_accuracy": 0.875573918223381, "num_tokens": 86349613.0, "step": 807 }, { "epoch": 1.8415051311288484, "grad_norm": 2.375, "learning_rate": 4.759494872823896e-06, "loss": 0.6257, "mean_token_accuracy": 0.8726819604635239, "num_tokens": 86456544.0, "step": 808 }, { "epoch": 1.8437856328392246, "grad_norm": 3.203125, "learning_rate": 4.758688192002741e-06, "loss": 0.6027, "mean_token_accuracy": 0.8799534738063812, "num_tokens": 86563859.0, "step": 809 }, { "epoch": 1.8460661345496008, "grad_norm": 2.46875, "learning_rate": 4.757880229193773e-06, "loss": 0.599, "mean_token_accuracy": 0.8771905303001404, "num_tokens": 86670639.0, "step": 810 }, { "epoch": 1.8483466362599772, "grad_norm": 2.125, "learning_rate": 4.757070984855577e-06, "loss": 0.574, "mean_token_accuracy": 0.8817077875137329, "num_tokens": 86777613.0, "step": 811 }, { "epoch": 1.8506271379703536, "grad_norm": 2.640625, "learning_rate": 4.756260459447465e-06, "loss": 0.6173, "mean_token_accuracy": 0.8729428201913834, "num_tokens": 86884393.0, "step": 812 }, { "epoch": 1.8529076396807298, "grad_norm": 3.0, "learning_rate": 4.755448653429475e-06, "loss": 0.6015, "mean_token_accuracy": 0.877841129899025, "num_tokens": 86991781.0, "step": 813 }, { "epoch": 1.855188141391106, "grad_norm": 2.859375, "learning_rate": 4.754635567262372e-06, "loss": 0.5992, "mean_token_accuracy": 0.8776097446680069, "num_tokens": 87098943.0, "step": 814 }, { "epoch": 1.8574686431014822, "grad_norm": 2.171875, "learning_rate": 4.753821201407648e-06, "loss": 0.603, "mean_token_accuracy": 0.8773338198661804, "num_tokens": 87205996.0, "step": 815 }, { "epoch": 1.8597491448118586, "grad_norm": 2.34375, "learning_rate": 4.7530055563275225e-06, "loss": 0.6032, "mean_token_accuracy": 0.8762088716030121, "num_tokens": 87313545.0, "step": 816 }, { "epoch": 1.862029646522235, "grad_norm": 2.75, "learning_rate": 4.7521886324849385e-06, "loss": 0.626, "mean_token_accuracy": 0.8743972480297089, "num_tokens": 87420224.0, "step": 817 }, { "epoch": 1.8643101482326112, "grad_norm": 3.203125, "learning_rate": 4.751370430343568e-06, "loss": 0.5959, "mean_token_accuracy": 0.8746133893728256, "num_tokens": 87527454.0, "step": 818 }, { "epoch": 1.8665906499429874, "grad_norm": 3.484375, "learning_rate": 4.750550950367805e-06, "loss": 0.5718, "mean_token_accuracy": 0.8807315528392792, "num_tokens": 87634497.0, "step": 819 }, { "epoch": 1.8688711516533636, "grad_norm": 2.25, "learning_rate": 4.749730193022771e-06, "loss": 0.6099, "mean_token_accuracy": 0.8758885711431503, "num_tokens": 87741318.0, "step": 820 }, { "epoch": 1.87115165336374, "grad_norm": 5.78125, "learning_rate": 4.748908158774312e-06, "loss": 0.5887, "mean_token_accuracy": 0.8823642432689667, "num_tokens": 87848637.0, "step": 821 }, { "epoch": 1.8734321550741164, "grad_norm": 2.515625, "learning_rate": 4.748084848089e-06, "loss": 0.602, "mean_token_accuracy": 0.876085102558136, "num_tokens": 87955464.0, "step": 822 }, { "epoch": 1.8757126567844926, "grad_norm": 2.0, "learning_rate": 4.747260261434128e-06, "loss": 0.5963, "mean_token_accuracy": 0.8766376227140427, "num_tokens": 88062827.0, "step": 823 }, { "epoch": 1.8779931584948688, "grad_norm": 4.375, "learning_rate": 4.7464343992777175e-06, "loss": 0.6083, "mean_token_accuracy": 0.8759739398956299, "num_tokens": 88169682.0, "step": 824 }, { "epoch": 1.880273660205245, "grad_norm": 3.5625, "learning_rate": 4.74560726208851e-06, "loss": 0.6146, "mean_token_accuracy": 0.8760395795106888, "num_tokens": 88276642.0, "step": 825 }, { "epoch": 1.8825541619156214, "grad_norm": 3.046875, "learning_rate": 4.744778850335974e-06, "loss": 0.6127, "mean_token_accuracy": 0.8747592866420746, "num_tokens": 88383703.0, "step": 826 }, { "epoch": 1.8848346636259978, "grad_norm": 2.640625, "learning_rate": 4.7439491644903e-06, "loss": 0.6065, "mean_token_accuracy": 0.8750014901161194, "num_tokens": 88490924.0, "step": 827 }, { "epoch": 1.887115165336374, "grad_norm": 5.5, "learning_rate": 4.743118205022402e-06, "loss": 0.5895, "mean_token_accuracy": 0.8755096942186356, "num_tokens": 88597962.0, "step": 828 }, { "epoch": 1.8893956670467502, "grad_norm": 5.25, "learning_rate": 4.742285972403915e-06, "loss": 0.6042, "mean_token_accuracy": 0.8763994574546814, "num_tokens": 88705068.0, "step": 829 }, { "epoch": 1.8916761687571264, "grad_norm": 3.265625, "learning_rate": 4.7414524671071995e-06, "loss": 0.5721, "mean_token_accuracy": 0.8812403380870819, "num_tokens": 88812440.0, "step": 830 }, { "epoch": 1.8939566704675028, "grad_norm": 4.9375, "learning_rate": 4.7406176896053356e-06, "loss": 0.6033, "mean_token_accuracy": 0.8769374042749405, "num_tokens": 88919411.0, "step": 831 }, { "epoch": 1.8962371721778792, "grad_norm": 2.171875, "learning_rate": 4.739781640372129e-06, "loss": 0.6026, "mean_token_accuracy": 0.8767253011465073, "num_tokens": 89026619.0, "step": 832 }, { "epoch": 1.8985176738882554, "grad_norm": 3.984375, "learning_rate": 4.7389443198821035e-06, "loss": 0.6013, "mean_token_accuracy": 0.8763202428817749, "num_tokens": 89133774.0, "step": 833 }, { "epoch": 1.9007981755986316, "grad_norm": 2.734375, "learning_rate": 4.738105728610507e-06, "loss": 0.5916, "mean_token_accuracy": 0.8780044764280319, "num_tokens": 89240785.0, "step": 834 }, { "epoch": 1.9030786773090078, "grad_norm": 2.453125, "learning_rate": 4.737265867033307e-06, "loss": 0.6104, "mean_token_accuracy": 0.876687616109848, "num_tokens": 89347651.0, "step": 835 }, { "epoch": 1.9053591790193842, "grad_norm": 6.125, "learning_rate": 4.736424735627193e-06, "loss": 0.5907, "mean_token_accuracy": 0.8766518086194992, "num_tokens": 89454086.0, "step": 836 }, { "epoch": 1.9076396807297606, "grad_norm": 4.03125, "learning_rate": 4.735582334869575e-06, "loss": 0.5983, "mean_token_accuracy": 0.8772305101156235, "num_tokens": 89561179.0, "step": 837 }, { "epoch": 1.9099201824401368, "grad_norm": 2.875, "learning_rate": 4.734738665238583e-06, "loss": 0.5828, "mean_token_accuracy": 0.8851803094148636, "num_tokens": 89668499.0, "step": 838 }, { "epoch": 1.912200684150513, "grad_norm": 2.484375, "learning_rate": 4.733893727213068e-06, "loss": 0.5937, "mean_token_accuracy": 0.8792501837015152, "num_tokens": 89775580.0, "step": 839 }, { "epoch": 1.9144811858608894, "grad_norm": 3.65625, "learning_rate": 4.7330475212726e-06, "loss": 0.5953, "mean_token_accuracy": 0.8790338486433029, "num_tokens": 89883322.0, "step": 840 }, { "epoch": 1.9167616875712656, "grad_norm": 4.78125, "learning_rate": 4.73220004789747e-06, "loss": 0.6213, "mean_token_accuracy": 0.8717529475688934, "num_tokens": 89989995.0, "step": 841 }, { "epoch": 1.919042189281642, "grad_norm": 4.90625, "learning_rate": 4.7313513075686875e-06, "loss": 0.6019, "mean_token_accuracy": 0.8774217069149017, "num_tokens": 90097055.0, "step": 842 }, { "epoch": 1.9213226909920182, "grad_norm": 2.15625, "learning_rate": 4.73050130076798e-06, "loss": 0.5949, "mean_token_accuracy": 0.8793853372335434, "num_tokens": 90204483.0, "step": 843 }, { "epoch": 1.9236031927023944, "grad_norm": 3.515625, "learning_rate": 4.729650027977797e-06, "loss": 0.5769, "mean_token_accuracy": 0.8808252811431885, "num_tokens": 90311830.0, "step": 844 }, { "epoch": 1.9258836944127709, "grad_norm": 3.046875, "learning_rate": 4.728797489681302e-06, "loss": 0.5869, "mean_token_accuracy": 0.8766701221466064, "num_tokens": 90418653.0, "step": 845 }, { "epoch": 1.928164196123147, "grad_norm": 3.71875, "learning_rate": 4.7279436863623805e-06, "loss": 0.5924, "mean_token_accuracy": 0.880815863609314, "num_tokens": 90526355.0, "step": 846 }, { "epoch": 1.9304446978335235, "grad_norm": 2.625, "learning_rate": 4.7270886185056355e-06, "loss": 0.6098, "mean_token_accuracy": 0.8776162266731262, "num_tokens": 90633735.0, "step": 847 }, { "epoch": 1.9327251995438997, "grad_norm": 2.21875, "learning_rate": 4.726232286596385e-06, "loss": 0.6064, "mean_token_accuracy": 0.8745153993368149, "num_tokens": 90740939.0, "step": 848 }, { "epoch": 1.9350057012542758, "grad_norm": 3.625, "learning_rate": 4.725374691120669e-06, "loss": 0.6194, "mean_token_accuracy": 0.8727088868618011, "num_tokens": 90847747.0, "step": 849 }, { "epoch": 1.9372862029646523, "grad_norm": 4.5, "learning_rate": 4.7245158325652396e-06, "loss": 0.592, "mean_token_accuracy": 0.878357782959938, "num_tokens": 90954915.0, "step": 850 }, { "epoch": 1.9395667046750285, "grad_norm": 6.09375, "learning_rate": 4.7236557114175705e-06, "loss": 0.6183, "mean_token_accuracy": 0.8722603023052216, "num_tokens": 91061821.0, "step": 851 }, { "epoch": 1.9418472063854049, "grad_norm": 2.671875, "learning_rate": 4.722794328165849e-06, "loss": 0.6068, "mean_token_accuracy": 0.8738327622413635, "num_tokens": 91168909.0, "step": 852 }, { "epoch": 1.944127708095781, "grad_norm": 3.546875, "learning_rate": 4.721931683298979e-06, "loss": 0.597, "mean_token_accuracy": 0.8767188042402267, "num_tokens": 91275666.0, "step": 853 }, { "epoch": 1.9464082098061573, "grad_norm": 6.75, "learning_rate": 4.721067777306582e-06, "loss": 0.6291, "mean_token_accuracy": 0.8720938414335251, "num_tokens": 91382425.0, "step": 854 }, { "epoch": 1.9486887115165337, "grad_norm": 6.9375, "learning_rate": 4.7202026106789935e-06, "loss": 0.6055, "mean_token_accuracy": 0.87422114610672, "num_tokens": 91489317.0, "step": 855 }, { "epoch": 1.95096921322691, "grad_norm": 4.875, "learning_rate": 4.719336183907266e-06, "loss": 0.5824, "mean_token_accuracy": 0.8765899688005447, "num_tokens": 91596598.0, "step": 856 }, { "epoch": 1.9532497149372863, "grad_norm": 2.6875, "learning_rate": 4.718468497483166e-06, "loss": 0.5942, "mean_token_accuracy": 0.8780688494443893, "num_tokens": 91703876.0, "step": 857 }, { "epoch": 1.9555302166476625, "grad_norm": 3.5, "learning_rate": 4.717599551899177e-06, "loss": 0.5909, "mean_token_accuracy": 0.8795433938503265, "num_tokens": 91810908.0, "step": 858 }, { "epoch": 1.9578107183580387, "grad_norm": 6.5, "learning_rate": 4.716729347648494e-06, "loss": 0.6067, "mean_token_accuracy": 0.8784368187189102, "num_tokens": 91918098.0, "step": 859 }, { "epoch": 1.960091220068415, "grad_norm": 3.59375, "learning_rate": 4.71585788522503e-06, "loss": 0.6027, "mean_token_accuracy": 0.8751647025346756, "num_tokens": 92025449.0, "step": 860 }, { "epoch": 1.9623717217787915, "grad_norm": 3.546875, "learning_rate": 4.7149851651234085e-06, "loss": 0.6008, "mean_token_accuracy": 0.8768515288829803, "num_tokens": 92132495.0, "step": 861 }, { "epoch": 1.9646522234891677, "grad_norm": 6.25, "learning_rate": 4.714111187838969e-06, "loss": 0.6026, "mean_token_accuracy": 0.8760241121053696, "num_tokens": 92240414.0, "step": 862 }, { "epoch": 1.9669327251995439, "grad_norm": 2.421875, "learning_rate": 4.713235953867764e-06, "loss": 0.6127, "mean_token_accuracy": 0.8732226341962814, "num_tokens": 92346560.0, "step": 863 }, { "epoch": 1.96921322690992, "grad_norm": 3.078125, "learning_rate": 4.712359463706561e-06, "loss": 0.6059, "mean_token_accuracy": 0.8731984049081802, "num_tokens": 92453339.0, "step": 864 }, { "epoch": 1.9714937286202965, "grad_norm": 3.78125, "learning_rate": 4.711481717852837e-06, "loss": 0.6093, "mean_token_accuracy": 0.8733879327774048, "num_tokens": 92560826.0, "step": 865 }, { "epoch": 1.973774230330673, "grad_norm": 3.546875, "learning_rate": 4.710602716804784e-06, "loss": 0.614, "mean_token_accuracy": 0.8723070323467255, "num_tokens": 92667923.0, "step": 866 }, { "epoch": 1.976054732041049, "grad_norm": 2.78125, "learning_rate": 4.709722461061307e-06, "loss": 0.6102, "mean_token_accuracy": 0.8759769946336746, "num_tokens": 92774822.0, "step": 867 }, { "epoch": 1.9783352337514253, "grad_norm": 2.984375, "learning_rate": 4.70884095112202e-06, "loss": 0.5922, "mean_token_accuracy": 0.8767600506544113, "num_tokens": 92881978.0, "step": 868 }, { "epoch": 1.9806157354618015, "grad_norm": 4.84375, "learning_rate": 4.707958187487254e-06, "loss": 0.6102, "mean_token_accuracy": 0.8756605833768845, "num_tokens": 92989352.0, "step": 869 }, { "epoch": 1.982896237172178, "grad_norm": 3.78125, "learning_rate": 4.707074170658046e-06, "loss": 0.6259, "mean_token_accuracy": 0.8736278414726257, "num_tokens": 93095509.0, "step": 870 }, { "epoch": 1.9851767388825543, "grad_norm": 2.96875, "learning_rate": 4.706188901136148e-06, "loss": 0.5875, "mean_token_accuracy": 0.8808690756559372, "num_tokens": 93203007.0, "step": 871 }, { "epoch": 1.9874572405929305, "grad_norm": 2.28125, "learning_rate": 4.705302379424023e-06, "loss": 0.6058, "mean_token_accuracy": 0.8731043189764023, "num_tokens": 93310443.0, "step": 872 }, { "epoch": 1.9897377423033067, "grad_norm": 4.25, "learning_rate": 4.704414606024842e-06, "loss": 0.6241, "mean_token_accuracy": 0.8732631206512451, "num_tokens": 93416826.0, "step": 873 }, { "epoch": 1.9920182440136829, "grad_norm": 2.09375, "learning_rate": 4.703525581442488e-06, "loss": 0.5781, "mean_token_accuracy": 0.8822022676467896, "num_tokens": 93523746.0, "step": 874 }, { "epoch": 1.9942987457240593, "grad_norm": 3.5, "learning_rate": 4.702635306181554e-06, "loss": 0.5902, "mean_token_accuracy": 0.8761956989765167, "num_tokens": 93630669.0, "step": 875 }, { "epoch": 1.9965792474344357, "grad_norm": 2.421875, "learning_rate": 4.701743780747345e-06, "loss": 0.6131, "mean_token_accuracy": 0.8748619705438614, "num_tokens": 93738175.0, "step": 876 }, { "epoch": 1.998859749144812, "grad_norm": 4.1875, "learning_rate": 4.700851005645872e-06, "loss": 0.6047, "mean_token_accuracy": 0.8752595335245132, "num_tokens": 93845319.0, "step": 877 }, { "epoch": 2.0, "grad_norm": 5.40625, "learning_rate": 4.699956981383857e-06, "loss": 0.5695, "mean_token_accuracy": 0.8865090310573578, "num_tokens": 93884464.0, "step": 878 }, { "epoch": 2.002280501710376, "grad_norm": 2.625, "learning_rate": 4.699061708468732e-06, "loss": 0.604, "mean_token_accuracy": 0.8751733750104904, "num_tokens": 93991225.0, "step": 879 }, { "epoch": 2.0045610034207524, "grad_norm": 2.390625, "learning_rate": 4.698165187408635e-06, "loss": 0.5823, "mean_token_accuracy": 0.8786071985960007, "num_tokens": 94098859.0, "step": 880 }, { "epoch": 2.0045610034207524, "eval_loss": 0.6056435108184814, "eval_mean_token_accuracy": 0.8761785948231193, "eval_num_tokens": 94098859.0, "eval_runtime": 58.5991, "eval_samples_per_second": 143.091, "eval_steps_per_second": 4.488, "step": 880 }, { "epoch": 2.006841505131129, "grad_norm": 2.640625, "learning_rate": 4.697267418712415e-06, "loss": 0.5955, "mean_token_accuracy": 0.8779609203338623, "num_tokens": 94205931.0, "step": 881 }, { "epoch": 2.009122006841505, "grad_norm": 2.734375, "learning_rate": 4.6963684028896285e-06, "loss": 0.5865, "mean_token_accuracy": 0.877967357635498, "num_tokens": 94313152.0, "step": 882 }, { "epoch": 2.0114025085518814, "grad_norm": 2.0625, "learning_rate": 4.695468140450539e-06, "loss": 0.5928, "mean_token_accuracy": 0.8791947513818741, "num_tokens": 94420319.0, "step": 883 }, { "epoch": 2.0136830102622576, "grad_norm": 4.875, "learning_rate": 4.6945666319061166e-06, "loss": 0.5875, "mean_token_accuracy": 0.8798933327198029, "num_tokens": 94527420.0, "step": 884 }, { "epoch": 2.015963511972634, "grad_norm": 2.859375, "learning_rate": 4.6936638777680435e-06, "loss": 0.5986, "mean_token_accuracy": 0.8755367547273636, "num_tokens": 94634228.0, "step": 885 }, { "epoch": 2.0182440136830104, "grad_norm": 1.9140625, "learning_rate": 4.6927598785487026e-06, "loss": 0.6088, "mean_token_accuracy": 0.873708963394165, "num_tokens": 94740813.0, "step": 886 }, { "epoch": 2.0205245153933866, "grad_norm": 3.25, "learning_rate": 4.691854634761188e-06, "loss": 0.5952, "mean_token_accuracy": 0.8793173581361771, "num_tokens": 94848566.0, "step": 887 }, { "epoch": 2.022805017103763, "grad_norm": 3.015625, "learning_rate": 4.690948146919299e-06, "loss": 0.5963, "mean_token_accuracy": 0.8790719360113144, "num_tokens": 94955659.0, "step": 888 }, { "epoch": 2.025085518814139, "grad_norm": 4.53125, "learning_rate": 4.690040415537538e-06, "loss": 0.5906, "mean_token_accuracy": 0.8785732984542847, "num_tokens": 95062760.0, "step": 889 }, { "epoch": 2.027366020524515, "grad_norm": 3.5, "learning_rate": 4.689131441131119e-06, "loss": 0.6134, "mean_token_accuracy": 0.8747037500143051, "num_tokens": 95170435.0, "step": 890 }, { "epoch": 2.029646522234892, "grad_norm": 2.34375, "learning_rate": 4.6882212242159555e-06, "loss": 0.5945, "mean_token_accuracy": 0.8810647279024124, "num_tokens": 95277421.0, "step": 891 }, { "epoch": 2.031927023945268, "grad_norm": 2.296875, "learning_rate": 4.687309765308671e-06, "loss": 0.5705, "mean_token_accuracy": 0.881627082824707, "num_tokens": 95384868.0, "step": 892 }, { "epoch": 2.034207525655644, "grad_norm": 2.015625, "learning_rate": 4.6863970649265914e-06, "loss": 0.5875, "mean_token_accuracy": 0.8783137798309326, "num_tokens": 95492292.0, "step": 893 }, { "epoch": 2.0364880273660204, "grad_norm": 5.78125, "learning_rate": 4.685483123587748e-06, "loss": 0.5816, "mean_token_accuracy": 0.8808389455080032, "num_tokens": 95599389.0, "step": 894 }, { "epoch": 2.0387685290763966, "grad_norm": 2.96875, "learning_rate": 4.684567941810876e-06, "loss": 0.5715, "mean_token_accuracy": 0.8855313509702682, "num_tokens": 95706662.0, "step": 895 }, { "epoch": 2.0410490307867732, "grad_norm": 2.796875, "learning_rate": 4.683651520115414e-06, "loss": 0.5804, "mean_token_accuracy": 0.8826097548007965, "num_tokens": 95814265.0, "step": 896 }, { "epoch": 2.0433295324971494, "grad_norm": 2.90625, "learning_rate": 4.682733859021508e-06, "loss": 0.5875, "mean_token_accuracy": 0.8777095377445221, "num_tokens": 95921432.0, "step": 897 }, { "epoch": 2.0456100342075256, "grad_norm": 5.71875, "learning_rate": 4.681814959050002e-06, "loss": 0.6054, "mean_token_accuracy": 0.8748744130134583, "num_tokens": 96029029.0, "step": 898 }, { "epoch": 2.047890535917902, "grad_norm": 5.65625, "learning_rate": 4.680894820722446e-06, "loss": 0.5872, "mean_token_accuracy": 0.8802212625741959, "num_tokens": 96136531.0, "step": 899 }, { "epoch": 2.050171037628278, "grad_norm": 6.875, "learning_rate": 4.679973444561095e-06, "loss": 0.6056, "mean_token_accuracy": 0.873874619603157, "num_tokens": 96243395.0, "step": 900 }, { "epoch": 2.0524515393386547, "grad_norm": 3.640625, "learning_rate": 4.679050831088902e-06, "loss": 0.592, "mean_token_accuracy": 0.8777193129062653, "num_tokens": 96350083.0, "step": 901 }, { "epoch": 2.054732041049031, "grad_norm": 2.109375, "learning_rate": 4.678126980829525e-06, "loss": 0.5916, "mean_token_accuracy": 0.8807910680770874, "num_tokens": 96457480.0, "step": 902 }, { "epoch": 2.057012542759407, "grad_norm": 5.0, "learning_rate": 4.677201894307325e-06, "loss": 0.6114, "mean_token_accuracy": 0.872836172580719, "num_tokens": 96564282.0, "step": 903 }, { "epoch": 2.0592930444697832, "grad_norm": 4.75, "learning_rate": 4.676275572047362e-06, "loss": 0.6123, "mean_token_accuracy": 0.871756300330162, "num_tokens": 96671356.0, "step": 904 }, { "epoch": 2.0615735461801594, "grad_norm": 4.6875, "learning_rate": 4.675348014575399e-06, "loss": 0.6142, "mean_token_accuracy": 0.8757220953702927, "num_tokens": 96778413.0, "step": 905 }, { "epoch": 2.063854047890536, "grad_norm": 2.28125, "learning_rate": 4.674419222417899e-06, "loss": 0.6063, "mean_token_accuracy": 0.8768678158521652, "num_tokens": 96885296.0, "step": 906 }, { "epoch": 2.0661345496009123, "grad_norm": 2.09375, "learning_rate": 4.673489196102028e-06, "loss": 0.6045, "mean_token_accuracy": 0.874673992395401, "num_tokens": 96992919.0, "step": 907 }, { "epoch": 2.0684150513112884, "grad_norm": 2.328125, "learning_rate": 4.67255793615565e-06, "loss": 0.6077, "mean_token_accuracy": 0.8760568499565125, "num_tokens": 97099795.0, "step": 908 }, { "epoch": 2.0706955530216646, "grad_norm": 4.90625, "learning_rate": 4.67162544310733e-06, "loss": 0.5799, "mean_token_accuracy": 0.8787839561700821, "num_tokens": 97206999.0, "step": 909 }, { "epoch": 2.072976054732041, "grad_norm": 7.03125, "learning_rate": 4.670691717486333e-06, "loss": 0.5961, "mean_token_accuracy": 0.8767089992761612, "num_tokens": 97314451.0, "step": 910 }, { "epoch": 2.0752565564424175, "grad_norm": 5.84375, "learning_rate": 4.669756759822625e-06, "loss": 0.6157, "mean_token_accuracy": 0.8748338222503662, "num_tokens": 97421703.0, "step": 911 }, { "epoch": 2.0775370581527937, "grad_norm": 2.15625, "learning_rate": 4.668820570646868e-06, "loss": 0.5903, "mean_token_accuracy": 0.8794323354959488, "num_tokens": 97528420.0, "step": 912 }, { "epoch": 2.07981755986317, "grad_norm": 2.484375, "learning_rate": 4.667883150490427e-06, "loss": 0.6027, "mean_token_accuracy": 0.8755746185779572, "num_tokens": 97634954.0, "step": 913 }, { "epoch": 2.082098061573546, "grad_norm": 3.3125, "learning_rate": 4.666944499885361e-06, "loss": 0.6083, "mean_token_accuracy": 0.8736224919557571, "num_tokens": 97742085.0, "step": 914 }, { "epoch": 2.0843785632839227, "grad_norm": 2.484375, "learning_rate": 4.6660046193644315e-06, "loss": 0.6008, "mean_token_accuracy": 0.8764249086380005, "num_tokens": 97849138.0, "step": 915 }, { "epoch": 2.086659064994299, "grad_norm": 2.671875, "learning_rate": 4.665063509461098e-06, "loss": 0.5977, "mean_token_accuracy": 0.8795299530029297, "num_tokens": 97956499.0, "step": 916 }, { "epoch": 2.088939566704675, "grad_norm": 2.21875, "learning_rate": 4.664121170709512e-06, "loss": 0.5861, "mean_token_accuracy": 0.8816528022289276, "num_tokens": 98063712.0, "step": 917 }, { "epoch": 2.0912200684150513, "grad_norm": 2.6875, "learning_rate": 4.663177603644532e-06, "loss": 0.5977, "mean_token_accuracy": 0.8774126917123795, "num_tokens": 98170731.0, "step": 918 }, { "epoch": 2.0935005701254275, "grad_norm": 2.921875, "learning_rate": 4.662232808801704e-06, "loss": 0.6308, "mean_token_accuracy": 0.8704886138439178, "num_tokens": 98277880.0, "step": 919 }, { "epoch": 2.095781071835804, "grad_norm": 4.0, "learning_rate": 4.661286786717278e-06, "loss": 0.5885, "mean_token_accuracy": 0.8788218349218369, "num_tokens": 98385335.0, "step": 920 }, { "epoch": 2.0980615735461803, "grad_norm": 3.234375, "learning_rate": 4.660339537928198e-06, "loss": 0.5937, "mean_token_accuracy": 0.8791757225990295, "num_tokens": 98492150.0, "step": 921 }, { "epoch": 2.1003420752565565, "grad_norm": 2.046875, "learning_rate": 4.659391062972102e-06, "loss": 0.5823, "mean_token_accuracy": 0.8784543722867966, "num_tokens": 98599554.0, "step": 922 }, { "epoch": 2.1026225769669327, "grad_norm": 2.265625, "learning_rate": 4.658441362387328e-06, "loss": 0.5841, "mean_token_accuracy": 0.8808614313602448, "num_tokens": 98706337.0, "step": 923 }, { "epoch": 2.104903078677309, "grad_norm": 3.515625, "learning_rate": 4.657490436712907e-06, "loss": 0.62, "mean_token_accuracy": 0.8705125451087952, "num_tokens": 98812884.0, "step": 924 }, { "epoch": 2.1071835803876855, "grad_norm": 3.796875, "learning_rate": 4.6565382864885665e-06, "loss": 0.5909, "mean_token_accuracy": 0.8783458173274994, "num_tokens": 98920205.0, "step": 925 }, { "epoch": 2.1094640820980617, "grad_norm": 2.3125, "learning_rate": 4.655584912254727e-06, "loss": 0.5919, "mean_token_accuracy": 0.8786965757608414, "num_tokens": 99027392.0, "step": 926 }, { "epoch": 2.111744583808438, "grad_norm": 3.09375, "learning_rate": 4.654630314552508e-06, "loss": 0.5787, "mean_token_accuracy": 0.8778886198997498, "num_tokens": 99134550.0, "step": 927 }, { "epoch": 2.114025085518814, "grad_norm": 3.390625, "learning_rate": 4.653674493923718e-06, "loss": 0.5749, "mean_token_accuracy": 0.8823033571243286, "num_tokens": 99241929.0, "step": 928 }, { "epoch": 2.1163055872291903, "grad_norm": 2.578125, "learning_rate": 4.652717450910864e-06, "loss": 0.6113, "mean_token_accuracy": 0.8728591203689575, "num_tokens": 99349158.0, "step": 929 }, { "epoch": 2.118586088939567, "grad_norm": 2.46875, "learning_rate": 4.651759186057144e-06, "loss": 0.5993, "mean_token_accuracy": 0.87718665599823, "num_tokens": 99456113.0, "step": 930 }, { "epoch": 2.120866590649943, "grad_norm": 2.640625, "learning_rate": 4.650799699906452e-06, "loss": 0.6052, "mean_token_accuracy": 0.8780839145183563, "num_tokens": 99563254.0, "step": 931 }, { "epoch": 2.1231470923603193, "grad_norm": 2.640625, "learning_rate": 4.649838993003373e-06, "loss": 0.5993, "mean_token_accuracy": 0.8814170211553574, "num_tokens": 99670632.0, "step": 932 }, { "epoch": 2.1254275940706955, "grad_norm": 4.625, "learning_rate": 4.648877065893186e-06, "loss": 0.5887, "mean_token_accuracy": 0.8802737295627594, "num_tokens": 99777773.0, "step": 933 }, { "epoch": 2.1277080957810717, "grad_norm": 3.5625, "learning_rate": 4.647913919121861e-06, "loss": 0.5973, "mean_token_accuracy": 0.8730404675006866, "num_tokens": 99885179.0, "step": 934 }, { "epoch": 2.1299885974914483, "grad_norm": 2.21875, "learning_rate": 4.646949553236064e-06, "loss": 0.5834, "mean_token_accuracy": 0.8850271999835968, "num_tokens": 99992436.0, "step": 935 }, { "epoch": 2.1322690992018245, "grad_norm": 4.75, "learning_rate": 4.645983968783148e-06, "loss": 0.5917, "mean_token_accuracy": 0.8796833902597427, "num_tokens": 100099422.0, "step": 936 }, { "epoch": 2.1345496009122007, "grad_norm": 5.9375, "learning_rate": 4.645017166311163e-06, "loss": 0.596, "mean_token_accuracy": 0.8761509507894516, "num_tokens": 100207243.0, "step": 937 }, { "epoch": 2.136830102622577, "grad_norm": 2.875, "learning_rate": 4.644049146368844e-06, "loss": 0.6185, "mean_token_accuracy": 0.8749994188547134, "num_tokens": 100313796.0, "step": 938 }, { "epoch": 2.139110604332953, "grad_norm": 1.9609375, "learning_rate": 4.643079909505622e-06, "loss": 0.5775, "mean_token_accuracy": 0.8829146772623062, "num_tokens": 100420992.0, "step": 939 }, { "epoch": 2.1413911060433297, "grad_norm": 3.078125, "learning_rate": 4.642109456271618e-06, "loss": 0.5853, "mean_token_accuracy": 0.8793570548295975, "num_tokens": 100528158.0, "step": 940 }, { "epoch": 2.143671607753706, "grad_norm": 4.15625, "learning_rate": 4.64113778721764e-06, "loss": 0.6007, "mean_token_accuracy": 0.8770193010568619, "num_tokens": 100635032.0, "step": 941 }, { "epoch": 2.145952109464082, "grad_norm": 5.4375, "learning_rate": 4.640164902895192e-06, "loss": 0.591, "mean_token_accuracy": 0.8772297501564026, "num_tokens": 100742739.0, "step": 942 }, { "epoch": 2.1482326111744583, "grad_norm": 2.5, "learning_rate": 4.6391908038564615e-06, "loss": 0.6109, "mean_token_accuracy": 0.8756915777921677, "num_tokens": 100849851.0, "step": 943 }, { "epoch": 2.1505131128848345, "grad_norm": 6.5, "learning_rate": 4.6382154906543295e-06, "loss": 0.5989, "mean_token_accuracy": 0.8732953071594238, "num_tokens": 100957153.0, "step": 944 }, { "epoch": 2.152793614595211, "grad_norm": 5.3125, "learning_rate": 4.637238963842365e-06, "loss": 0.6082, "mean_token_accuracy": 0.8774750828742981, "num_tokens": 101063587.0, "step": 945 }, { "epoch": 2.1550741163055873, "grad_norm": 3.96875, "learning_rate": 4.636261223974826e-06, "loss": 0.5955, "mean_token_accuracy": 0.8747821748256683, "num_tokens": 101170370.0, "step": 946 }, { "epoch": 2.1573546180159635, "grad_norm": 3.09375, "learning_rate": 4.635282271606658e-06, "loss": 0.5916, "mean_token_accuracy": 0.8776073157787323, "num_tokens": 101277426.0, "step": 947 }, { "epoch": 2.1596351197263397, "grad_norm": 2.5, "learning_rate": 4.634302107293497e-06, "loss": 0.5965, "mean_token_accuracy": 0.876614436507225, "num_tokens": 101383939.0, "step": 948 }, { "epoch": 2.161915621436716, "grad_norm": 2.28125, "learning_rate": 4.633320731591663e-06, "loss": 0.605, "mean_token_accuracy": 0.8757250159978867, "num_tokens": 101491379.0, "step": 949 }, { "epoch": 2.1641961231470925, "grad_norm": 4.34375, "learning_rate": 4.632338145058167e-06, "loss": 0.6244, "mean_token_accuracy": 0.8718846142292023, "num_tokens": 101598608.0, "step": 950 }, { "epoch": 2.1664766248574687, "grad_norm": 1.96875, "learning_rate": 4.631354348250706e-06, "loss": 0.5809, "mean_token_accuracy": 0.8830228000879288, "num_tokens": 101706020.0, "step": 951 }, { "epoch": 2.168757126567845, "grad_norm": 4.40625, "learning_rate": 4.630369341727665e-06, "loss": 0.6116, "mean_token_accuracy": 0.872904360294342, "num_tokens": 101812698.0, "step": 952 }, { "epoch": 2.171037628278221, "grad_norm": 4.3125, "learning_rate": 4.629383126048114e-06, "loss": 0.608, "mean_token_accuracy": 0.8747462183237076, "num_tokens": 101919836.0, "step": 953 }, { "epoch": 2.1733181299885973, "grad_norm": 4.875, "learning_rate": 4.6283957017718105e-06, "loss": 0.5813, "mean_token_accuracy": 0.8803711831569672, "num_tokens": 102027244.0, "step": 954 }, { "epoch": 2.175598631698974, "grad_norm": 1.96875, "learning_rate": 4.627407069459196e-06, "loss": 0.5976, "mean_token_accuracy": 0.8786141574382782, "num_tokens": 102134050.0, "step": 955 }, { "epoch": 2.17787913340935, "grad_norm": 3.8125, "learning_rate": 4.626417229671401e-06, "loss": 0.586, "mean_token_accuracy": 0.8817842751741409, "num_tokens": 102241872.0, "step": 956 }, { "epoch": 2.1801596351197263, "grad_norm": 4.28125, "learning_rate": 4.625426182970237e-06, "loss": 0.6055, "mean_token_accuracy": 0.8747357130050659, "num_tokens": 102348674.0, "step": 957 }, { "epoch": 2.1824401368301025, "grad_norm": 3.796875, "learning_rate": 4.6244339299182065e-06, "loss": 0.6021, "mean_token_accuracy": 0.8756296783685684, "num_tokens": 102455493.0, "step": 958 }, { "epoch": 2.1847206385404787, "grad_norm": 2.671875, "learning_rate": 4.62344047107849e-06, "loss": 0.5923, "mean_token_accuracy": 0.8777966946363449, "num_tokens": 102562617.0, "step": 959 }, { "epoch": 2.1870011402508553, "grad_norm": 3.90625, "learning_rate": 4.622445807014956e-06, "loss": 0.5815, "mean_token_accuracy": 0.8824858963489532, "num_tokens": 102670281.0, "step": 960 }, { "epoch": 2.1892816419612315, "grad_norm": 3.125, "learning_rate": 4.621449938292159e-06, "loss": 0.603, "mean_token_accuracy": 0.8756995797157288, "num_tokens": 102778053.0, "step": 961 }, { "epoch": 2.1915621436716077, "grad_norm": 4.46875, "learning_rate": 4.620452865475331e-06, "loss": 0.5975, "mean_token_accuracy": 0.8776930570602417, "num_tokens": 102885553.0, "step": 962 }, { "epoch": 2.193842645381984, "grad_norm": 2.46875, "learning_rate": 4.6194545891303955e-06, "loss": 0.6116, "mean_token_accuracy": 0.8760574162006378, "num_tokens": 102992589.0, "step": 963 }, { "epoch": 2.19612314709236, "grad_norm": 6.25, "learning_rate": 4.618455109823952e-06, "loss": 0.5997, "mean_token_accuracy": 0.87519970536232, "num_tokens": 103099395.0, "step": 964 }, { "epoch": 2.1984036488027368, "grad_norm": 3.84375, "learning_rate": 4.617454428123287e-06, "loss": 0.6091, "mean_token_accuracy": 0.8743910491466522, "num_tokens": 103205910.0, "step": 965 }, { "epoch": 2.200684150513113, "grad_norm": 5.21875, "learning_rate": 4.616452544596367e-06, "loss": 0.5834, "mean_token_accuracy": 0.8808601945638657, "num_tokens": 103313312.0, "step": 966 }, { "epoch": 2.202964652223489, "grad_norm": 2.25, "learning_rate": 4.615449459811843e-06, "loss": 0.587, "mean_token_accuracy": 0.882127583026886, "num_tokens": 103420313.0, "step": 967 }, { "epoch": 2.2052451539338653, "grad_norm": 2.4375, "learning_rate": 4.614445174339045e-06, "loss": 0.5807, "mean_token_accuracy": 0.8822465240955353, "num_tokens": 103527336.0, "step": 968 }, { "epoch": 2.2075256556442415, "grad_norm": 6.46875, "learning_rate": 4.613439688747988e-06, "loss": 0.5882, "mean_token_accuracy": 0.8785429149866104, "num_tokens": 103634750.0, "step": 969 }, { "epoch": 2.209806157354618, "grad_norm": 6.09375, "learning_rate": 4.612433003609365e-06, "loss": 0.6146, "mean_token_accuracy": 0.8751912713050842, "num_tokens": 103741472.0, "step": 970 }, { "epoch": 2.2120866590649944, "grad_norm": 3.515625, "learning_rate": 4.611425119494552e-06, "loss": 0.6132, "mean_token_accuracy": 0.8733502626419067, "num_tokens": 103848855.0, "step": 971 }, { "epoch": 2.2143671607753705, "grad_norm": 2.53125, "learning_rate": 4.6104160369756025e-06, "loss": 0.5832, "mean_token_accuracy": 0.8768988847732544, "num_tokens": 103956579.0, "step": 972 }, { "epoch": 2.2166476624857467, "grad_norm": 3.578125, "learning_rate": 4.609405756625254e-06, "loss": 0.5802, "mean_token_accuracy": 0.8784401416778564, "num_tokens": 104063842.0, "step": 973 }, { "epoch": 2.2189281641961234, "grad_norm": 3.109375, "learning_rate": 4.608394279016921e-06, "loss": 0.5887, "mean_token_accuracy": 0.8794825226068497, "num_tokens": 104171052.0, "step": 974 }, { "epoch": 2.2212086659064996, "grad_norm": 3.3125, "learning_rate": 4.6073816047247e-06, "loss": 0.5737, "mean_token_accuracy": 0.8819704353809357, "num_tokens": 104278607.0, "step": 975 }, { "epoch": 2.2234891676168758, "grad_norm": 2.265625, "learning_rate": 4.606367734323365e-06, "loss": 0.5971, "mean_token_accuracy": 0.8778064996004105, "num_tokens": 104385870.0, "step": 976 }, { "epoch": 2.225769669327252, "grad_norm": 3.109375, "learning_rate": 4.605352668388369e-06, "loss": 0.6163, "mean_token_accuracy": 0.8713217675685883, "num_tokens": 104493565.0, "step": 977 }, { "epoch": 2.228050171037628, "grad_norm": 3.34375, "learning_rate": 4.6043364074958435e-06, "loss": 0.5968, "mean_token_accuracy": 0.8773491680622101, "num_tokens": 104600945.0, "step": 978 }, { "epoch": 2.2303306727480043, "grad_norm": 2.6875, "learning_rate": 4.6033189522226e-06, "loss": 0.6037, "mean_token_accuracy": 0.8754960596561432, "num_tokens": 104707727.0, "step": 979 }, { "epoch": 2.232611174458381, "grad_norm": 2.5625, "learning_rate": 4.602300303146123e-06, "loss": 0.5731, "mean_token_accuracy": 0.8821861445903778, "num_tokens": 104815334.0, "step": 980 }, { "epoch": 2.234891676168757, "grad_norm": 2.140625, "learning_rate": 4.601280460844583e-06, "loss": 0.5935, "mean_token_accuracy": 0.8821869492530823, "num_tokens": 104922027.0, "step": 981 }, { "epoch": 2.2371721778791334, "grad_norm": 3.125, "learning_rate": 4.6002594258968185e-06, "loss": 0.5679, "mean_token_accuracy": 0.8820158392190933, "num_tokens": 105028843.0, "step": 982 }, { "epoch": 2.2394526795895096, "grad_norm": 3.546875, "learning_rate": 4.599237198882351e-06, "loss": 0.6006, "mean_token_accuracy": 0.8787632882595062, "num_tokens": 105135688.0, "step": 983 }, { "epoch": 2.241733181299886, "grad_norm": 2.875, "learning_rate": 4.598213780381377e-06, "loss": 0.5986, "mean_token_accuracy": 0.8801819086074829, "num_tokens": 105242376.0, "step": 984 }, { "epoch": 2.2440136830102624, "grad_norm": 3.046875, "learning_rate": 4.59718917097477e-06, "loss": 0.5932, "mean_token_accuracy": 0.8789917528629303, "num_tokens": 105349325.0, "step": 985 }, { "epoch": 2.2462941847206386, "grad_norm": 5.8125, "learning_rate": 4.596163371244076e-06, "loss": 0.5752, "mean_token_accuracy": 0.8811417818069458, "num_tokens": 105456213.0, "step": 986 }, { "epoch": 2.2485746864310148, "grad_norm": 7.59375, "learning_rate": 4.595136381771521e-06, "loss": 0.5982, "mean_token_accuracy": 0.8762593269348145, "num_tokens": 105562679.0, "step": 987 }, { "epoch": 2.250855188141391, "grad_norm": 7.8125, "learning_rate": 4.594108203140004e-06, "loss": 0.6039, "mean_token_accuracy": 0.8737095445394516, "num_tokens": 105670038.0, "step": 988 }, { "epoch": 2.253135689851767, "grad_norm": 3.65625, "learning_rate": 4.593078835933099e-06, "loss": 0.5871, "mean_token_accuracy": 0.8798780590295792, "num_tokens": 105777357.0, "step": 989 }, { "epoch": 2.255416191562144, "grad_norm": 2.4375, "learning_rate": 4.592048280735055e-06, "loss": 0.5814, "mean_token_accuracy": 0.879479706287384, "num_tokens": 105884657.0, "step": 990 }, { "epoch": 2.25769669327252, "grad_norm": 5.71875, "learning_rate": 4.591016538130796e-06, "loss": 0.5854, "mean_token_accuracy": 0.8783192932605743, "num_tokens": 105991145.0, "step": 991 }, { "epoch": 2.259977194982896, "grad_norm": 5.59375, "learning_rate": 4.589983608705918e-06, "loss": 0.6045, "mean_token_accuracy": 0.8753475993871689, "num_tokens": 106097808.0, "step": 992 }, { "epoch": 2.2622576966932724, "grad_norm": 5.78125, "learning_rate": 4.588949493046693e-06, "loss": 0.628, "mean_token_accuracy": 0.8707805275917053, "num_tokens": 106204975.0, "step": 993 }, { "epoch": 2.264538198403649, "grad_norm": 4.375, "learning_rate": 4.587914191740064e-06, "loss": 0.6265, "mean_token_accuracy": 0.8718110471963882, "num_tokens": 106311506.0, "step": 994 }, { "epoch": 2.266818700114025, "grad_norm": 2.765625, "learning_rate": 4.586877705373648e-06, "loss": 0.5866, "mean_token_accuracy": 0.8814664483070374, "num_tokens": 106418583.0, "step": 995 }, { "epoch": 2.2690992018244014, "grad_norm": 3.625, "learning_rate": 4.585840034535736e-06, "loss": 0.5829, "mean_token_accuracy": 0.881786435842514, "num_tokens": 106526615.0, "step": 996 }, { "epoch": 2.2713797035347776, "grad_norm": 7.40625, "learning_rate": 4.584801179815289e-06, "loss": 0.5919, "mean_token_accuracy": 0.878081277012825, "num_tokens": 106633800.0, "step": 997 }, { "epoch": 2.2736602052451538, "grad_norm": 7.59375, "learning_rate": 4.583761141801941e-06, "loss": 0.6034, "mean_token_accuracy": 0.8735693991184235, "num_tokens": 106740741.0, "step": 998 }, { "epoch": 2.27594070695553, "grad_norm": 5.0625, "learning_rate": 4.5827199210859975e-06, "loss": 0.5934, "mean_token_accuracy": 0.8779504746198654, "num_tokens": 106847997.0, "step": 999 }, { "epoch": 2.2782212086659066, "grad_norm": 5.4375, "learning_rate": 4.581677518258435e-06, "loss": 0.6154, "mean_token_accuracy": 0.8708099573850632, "num_tokens": 106955093.0, "step": 1000 }, { "epoch": 2.280501710376283, "grad_norm": 5.25, "learning_rate": 4.580633933910901e-06, "loss": 0.5761, "mean_token_accuracy": 0.881976768374443, "num_tokens": 107062229.0, "step": 1001 }, { "epoch": 2.282782212086659, "grad_norm": 5.5625, "learning_rate": 4.579589168635715e-06, "loss": 0.6241, "mean_token_accuracy": 0.8718415945768356, "num_tokens": 107168673.0, "step": 1002 }, { "epoch": 2.285062713797035, "grad_norm": 3.765625, "learning_rate": 4.578543223025865e-06, "loss": 0.6294, "mean_token_accuracy": 0.8693426996469498, "num_tokens": 107275384.0, "step": 1003 }, { "epoch": 2.287343215507412, "grad_norm": 3.265625, "learning_rate": 4.577496097675009e-06, "loss": 0.5871, "mean_token_accuracy": 0.8758938610553741, "num_tokens": 107382155.0, "step": 1004 }, { "epoch": 2.289623717217788, "grad_norm": 3.03125, "learning_rate": 4.576447793177476e-06, "loss": 0.6103, "mean_token_accuracy": 0.8743392378091812, "num_tokens": 107488845.0, "step": 1005 }, { "epoch": 2.291904218928164, "grad_norm": 2.1875, "learning_rate": 4.575398310128263e-06, "loss": 0.5773, "mean_token_accuracy": 0.8819352090358734, "num_tokens": 107596244.0, "step": 1006 }, { "epoch": 2.2941847206385404, "grad_norm": 3.140625, "learning_rate": 4.574347649123036e-06, "loss": 0.6106, "mean_token_accuracy": 0.8723960518836975, "num_tokens": 107703259.0, "step": 1007 }, { "epoch": 2.2964652223489166, "grad_norm": 2.5625, "learning_rate": 4.57329581075813e-06, "loss": 0.626, "mean_token_accuracy": 0.8719481080770493, "num_tokens": 107810343.0, "step": 1008 }, { "epoch": 2.2987457240592932, "grad_norm": 5.1875, "learning_rate": 4.572242795630549e-06, "loss": 0.5856, "mean_token_accuracy": 0.8785750865936279, "num_tokens": 107917748.0, "step": 1009 }, { "epoch": 2.3010262257696694, "grad_norm": 2.609375, "learning_rate": 4.571188604337963e-06, "loss": 0.5905, "mean_token_accuracy": 0.8774587363004684, "num_tokens": 108025056.0, "step": 1010 }, { "epoch": 2.3033067274800456, "grad_norm": 2.09375, "learning_rate": 4.570133237478711e-06, "loss": 0.6028, "mean_token_accuracy": 0.8772729784250259, "num_tokens": 108131874.0, "step": 1011 }, { "epoch": 2.305587229190422, "grad_norm": 3.765625, "learning_rate": 4.5690766956517985e-06, "loss": 0.5825, "mean_token_accuracy": 0.883285716176033, "num_tokens": 108238676.0, "step": 1012 }, { "epoch": 2.307867730900798, "grad_norm": 4.8125, "learning_rate": 4.568018979456899e-06, "loss": 0.5937, "mean_token_accuracy": 0.87696273624897, "num_tokens": 108345907.0, "step": 1013 }, { "epoch": 2.3101482326111746, "grad_norm": 5.46875, "learning_rate": 4.566960089494351e-06, "loss": 0.6027, "mean_token_accuracy": 0.8753390312194824, "num_tokens": 108453246.0, "step": 1014 }, { "epoch": 2.312428734321551, "grad_norm": 2.484375, "learning_rate": 4.5659000263651615e-06, "loss": 0.5782, "mean_token_accuracy": 0.8795862644910812, "num_tokens": 108560344.0, "step": 1015 }, { "epoch": 2.314709236031927, "grad_norm": 2.46875, "learning_rate": 4.564838790671e-06, "loss": 0.5794, "mean_token_accuracy": 0.8801321387290955, "num_tokens": 108667748.0, "step": 1016 }, { "epoch": 2.316989737742303, "grad_norm": 3.90625, "learning_rate": 4.5637763830142046e-06, "loss": 0.567, "mean_token_accuracy": 0.8832177221775055, "num_tokens": 108775087.0, "step": 1017 }, { "epoch": 2.3192702394526794, "grad_norm": 4.96875, "learning_rate": 4.562712803997776e-06, "loss": 0.6119, "mean_token_accuracy": 0.8724011331796646, "num_tokens": 108882040.0, "step": 1018 }, { "epoch": 2.321550741163056, "grad_norm": 2.203125, "learning_rate": 4.5616480542253825e-06, "loss": 0.5942, "mean_token_accuracy": 0.8780981302261353, "num_tokens": 108988642.0, "step": 1019 }, { "epoch": 2.3238312428734322, "grad_norm": 2.59375, "learning_rate": 4.5605821343013555e-06, "loss": 0.61, "mean_token_accuracy": 0.8755066245794296, "num_tokens": 109096022.0, "step": 1020 }, { "epoch": 2.3261117445838084, "grad_norm": 3.796875, "learning_rate": 4.55951504483069e-06, "loss": 0.5988, "mean_token_accuracy": 0.878739207983017, "num_tokens": 109203309.0, "step": 1021 }, { "epoch": 2.3283922462941846, "grad_norm": 2.71875, "learning_rate": 4.558446786419045e-06, "loss": 0.599, "mean_token_accuracy": 0.8794536143541336, "num_tokens": 109309812.0, "step": 1022 }, { "epoch": 2.330672748004561, "grad_norm": 2.296875, "learning_rate": 4.557377359672745e-06, "loss": 0.5817, "mean_token_accuracy": 0.8791648596525192, "num_tokens": 109416293.0, "step": 1023 }, { "epoch": 2.3329532497149374, "grad_norm": 2.65625, "learning_rate": 4.556306765198775e-06, "loss": 0.5715, "mean_token_accuracy": 0.8814702332019806, "num_tokens": 109523500.0, "step": 1024 }, { "epoch": 2.3352337514253136, "grad_norm": 2.140625, "learning_rate": 4.555235003604782e-06, "loss": 0.5865, "mean_token_accuracy": 0.8794472515583038, "num_tokens": 109630312.0, "step": 1025 }, { "epoch": 2.33751425313569, "grad_norm": 4.65625, "learning_rate": 4.55416207549908e-06, "loss": 0.5963, "mean_token_accuracy": 0.8786284774541855, "num_tokens": 109737301.0, "step": 1026 }, { "epoch": 2.339794754846066, "grad_norm": 4.625, "learning_rate": 4.5530879814906404e-06, "loss": 0.6159, "mean_token_accuracy": 0.8746591061353683, "num_tokens": 109844193.0, "step": 1027 }, { "epoch": 2.342075256556442, "grad_norm": 2.5, "learning_rate": 4.5520127221891e-06, "loss": 0.6225, "mean_token_accuracy": 0.8710612952709198, "num_tokens": 109951493.0, "step": 1028 }, { "epoch": 2.344355758266819, "grad_norm": 2.640625, "learning_rate": 4.5509362982047525e-06, "loss": 0.5982, "mean_token_accuracy": 0.8757833987474442, "num_tokens": 110058293.0, "step": 1029 }, { "epoch": 2.346636259977195, "grad_norm": 2.828125, "learning_rate": 4.549858710148558e-06, "loss": 0.592, "mean_token_accuracy": 0.876603439450264, "num_tokens": 110165755.0, "step": 1030 }, { "epoch": 2.3489167616875712, "grad_norm": 4.125, "learning_rate": 4.548779958632134e-06, "loss": 0.5678, "mean_token_accuracy": 0.8827737718820572, "num_tokens": 110273020.0, "step": 1031 }, { "epoch": 2.3511972633979474, "grad_norm": 2.84375, "learning_rate": 4.5477000442677575e-06, "loss": 0.6005, "mean_token_accuracy": 0.8793087154626846, "num_tokens": 110379923.0, "step": 1032 }, { "epoch": 2.353477765108324, "grad_norm": 3.171875, "learning_rate": 4.546618967668369e-06, "loss": 0.5855, "mean_token_accuracy": 0.8808025866746902, "num_tokens": 110487050.0, "step": 1033 }, { "epoch": 2.3557582668187003, "grad_norm": 2.28125, "learning_rate": 4.545536729447566e-06, "loss": 0.5604, "mean_token_accuracy": 0.8820274025201797, "num_tokens": 110594620.0, "step": 1034 }, { "epoch": 2.3580387685290765, "grad_norm": 3.359375, "learning_rate": 4.544453330219606e-06, "loss": 0.5961, "mean_token_accuracy": 0.8768385797739029, "num_tokens": 110701829.0, "step": 1035 }, { "epoch": 2.3603192702394526, "grad_norm": 4.84375, "learning_rate": 4.543368770599406e-06, "loss": 0.5974, "mean_token_accuracy": 0.8801583051681519, "num_tokens": 110809252.0, "step": 1036 }, { "epoch": 2.362599771949829, "grad_norm": 2.484375, "learning_rate": 4.542283051202539e-06, "loss": 0.5896, "mean_token_accuracy": 0.8764929324388504, "num_tokens": 110916093.0, "step": 1037 }, { "epoch": 2.364880273660205, "grad_norm": 6.09375, "learning_rate": 4.541196172645242e-06, "loss": 0.5974, "mean_token_accuracy": 0.8785767704248428, "num_tokens": 111023473.0, "step": 1038 }, { "epoch": 2.3671607753705817, "grad_norm": 6.9375, "learning_rate": 4.540108135544403e-06, "loss": 0.5914, "mean_token_accuracy": 0.8778847008943558, "num_tokens": 111130747.0, "step": 1039 }, { "epoch": 2.369441277080958, "grad_norm": 4.8125, "learning_rate": 4.5390189405175725e-06, "loss": 0.5938, "mean_token_accuracy": 0.8778575360774994, "num_tokens": 111237498.0, "step": 1040 }, { "epoch": 2.371721778791334, "grad_norm": 2.328125, "learning_rate": 4.537928588182955e-06, "loss": 0.6031, "mean_token_accuracy": 0.8711429536342621, "num_tokens": 111344543.0, "step": 1041 }, { "epoch": 2.3740022805017102, "grad_norm": 3.125, "learning_rate": 4.536837079159416e-06, "loss": 0.5712, "mean_token_accuracy": 0.8806983381509781, "num_tokens": 111451314.0, "step": 1042 }, { "epoch": 2.376282782212087, "grad_norm": 3.09375, "learning_rate": 4.535744414066473e-06, "loss": 0.5813, "mean_token_accuracy": 0.8755781650543213, "num_tokens": 111558317.0, "step": 1043 }, { "epoch": 2.378563283922463, "grad_norm": 2.53125, "learning_rate": 4.534650593524302e-06, "loss": 0.6024, "mean_token_accuracy": 0.874798059463501, "num_tokens": 111664986.0, "step": 1044 }, { "epoch": 2.3808437856328393, "grad_norm": 4.03125, "learning_rate": 4.533555618153735e-06, "loss": 0.581, "mean_token_accuracy": 0.8804291188716888, "num_tokens": 111772027.0, "step": 1045 }, { "epoch": 2.3831242873432155, "grad_norm": 2.5, "learning_rate": 4.532459488576258e-06, "loss": 0.5837, "mean_token_accuracy": 0.8797705769538879, "num_tokens": 111878671.0, "step": 1046 }, { "epoch": 2.3854047890535917, "grad_norm": 4.90625, "learning_rate": 4.531362205414013e-06, "loss": 0.59, "mean_token_accuracy": 0.8787082433700562, "num_tokens": 111985253.0, "step": 1047 }, { "epoch": 2.387685290763968, "grad_norm": 3.5, "learning_rate": 4.530263769289798e-06, "loss": 0.5874, "mean_token_accuracy": 0.8787430375814438, "num_tokens": 112092830.0, "step": 1048 }, { "epoch": 2.3899657924743445, "grad_norm": 4.625, "learning_rate": 4.529164180827063e-06, "loss": 0.6002, "mean_token_accuracy": 0.8749395608901978, "num_tokens": 112199272.0, "step": 1049 }, { "epoch": 2.3922462941847207, "grad_norm": 5.875, "learning_rate": 4.528063440649913e-06, "loss": 0.5932, "mean_token_accuracy": 0.8786779493093491, "num_tokens": 112306480.0, "step": 1050 }, { "epoch": 2.394526795895097, "grad_norm": 2.1875, "learning_rate": 4.526961549383109e-06, "loss": 0.5999, "mean_token_accuracy": 0.8784288913011551, "num_tokens": 112413379.0, "step": 1051 }, { "epoch": 2.396807297605473, "grad_norm": 3.796875, "learning_rate": 4.52585850765206e-06, "loss": 0.6035, "mean_token_accuracy": 0.8735271543264389, "num_tokens": 112520374.0, "step": 1052 }, { "epoch": 2.3990877993158497, "grad_norm": 2.4375, "learning_rate": 4.524754316082833e-06, "loss": 0.5796, "mean_token_accuracy": 0.8810184299945831, "num_tokens": 112627321.0, "step": 1053 }, { "epoch": 2.401368301026226, "grad_norm": 6.4375, "learning_rate": 4.5236489753021465e-06, "loss": 0.6087, "mean_token_accuracy": 0.8768182098865509, "num_tokens": 112734136.0, "step": 1054 }, { "epoch": 2.403648802736602, "grad_norm": 2.734375, "learning_rate": 4.522542485937369e-06, "loss": 0.5847, "mean_token_accuracy": 0.8769848495721817, "num_tokens": 112841135.0, "step": 1055 }, { "epoch": 2.4059293044469783, "grad_norm": 3.71875, "learning_rate": 4.521434848616523e-06, "loss": 0.5983, "mean_token_accuracy": 0.8787764012813568, "num_tokens": 112948501.0, "step": 1056 }, { "epoch": 2.4082098061573545, "grad_norm": 2.40625, "learning_rate": 4.520326063968283e-06, "loss": 0.5876, "mean_token_accuracy": 0.8776495456695557, "num_tokens": 113055115.0, "step": 1057 }, { "epoch": 2.4104903078677307, "grad_norm": 2.859375, "learning_rate": 4.5192161326219716e-06, "loss": 0.5887, "mean_token_accuracy": 0.8828926682472229, "num_tokens": 113162261.0, "step": 1058 }, { "epoch": 2.4127708095781073, "grad_norm": 2.03125, "learning_rate": 4.5181050552075665e-06, "loss": 0.5894, "mean_token_accuracy": 0.8777969181537628, "num_tokens": 113269282.0, "step": 1059 }, { "epoch": 2.4150513112884835, "grad_norm": 2.453125, "learning_rate": 4.516992832355694e-06, "loss": 0.5973, "mean_token_accuracy": 0.8747198283672333, "num_tokens": 113375649.0, "step": 1060 }, { "epoch": 2.4173318129988597, "grad_norm": 2.53125, "learning_rate": 4.515879464697629e-06, "loss": 0.5993, "mean_token_accuracy": 0.8800251632928848, "num_tokens": 113483268.0, "step": 1061 }, { "epoch": 2.419612314709236, "grad_norm": 2.53125, "learning_rate": 4.514764952865297e-06, "loss": 0.6132, "mean_token_accuracy": 0.8789681494235992, "num_tokens": 113590044.0, "step": 1062 }, { "epoch": 2.4218928164196125, "grad_norm": 4.34375, "learning_rate": 4.513649297491275e-06, "loss": 0.6064, "mean_token_accuracy": 0.8741051852703094, "num_tokens": 113696761.0, "step": 1063 }, { "epoch": 2.4241733181299887, "grad_norm": 2.78125, "learning_rate": 4.512532499208787e-06, "loss": 0.606, "mean_token_accuracy": 0.8762391060590744, "num_tokens": 113804097.0, "step": 1064 }, { "epoch": 2.426453819840365, "grad_norm": 2.21875, "learning_rate": 4.511414558651706e-06, "loss": 0.5828, "mean_token_accuracy": 0.8823606818914413, "num_tokens": 113911914.0, "step": 1065 }, { "epoch": 2.428734321550741, "grad_norm": 2.953125, "learning_rate": 4.5102954764545525e-06, "loss": 0.5754, "mean_token_accuracy": 0.8803216964006424, "num_tokens": 114018818.0, "step": 1066 }, { "epoch": 2.4310148232611173, "grad_norm": 2.671875, "learning_rate": 4.509175253252497e-06, "loss": 0.6118, "mean_token_accuracy": 0.8786596357822418, "num_tokens": 114125625.0, "step": 1067 }, { "epoch": 2.433295324971494, "grad_norm": 2.6875, "learning_rate": 4.508053889681357e-06, "loss": 0.5957, "mean_token_accuracy": 0.87969671189785, "num_tokens": 114232527.0, "step": 1068 }, { "epoch": 2.43557582668187, "grad_norm": 2.109375, "learning_rate": 4.5069313863775956e-06, "loss": 0.5815, "mean_token_accuracy": 0.8781027346849442, "num_tokens": 114339323.0, "step": 1069 }, { "epoch": 2.4378563283922463, "grad_norm": 2.625, "learning_rate": 4.505807743978325e-06, "loss": 0.5874, "mean_token_accuracy": 0.877943754196167, "num_tokens": 114446811.0, "step": 1070 }, { "epoch": 2.4401368301026225, "grad_norm": 2.8125, "learning_rate": 4.5046829631213014e-06, "loss": 0.613, "mean_token_accuracy": 0.8732447922229767, "num_tokens": 114553671.0, "step": 1071 }, { "epoch": 2.4424173318129987, "grad_norm": 2.890625, "learning_rate": 4.503557044444931e-06, "loss": 0.5986, "mean_token_accuracy": 0.8797748982906342, "num_tokens": 114660235.0, "step": 1072 }, { "epoch": 2.4446978335233753, "grad_norm": 6.09375, "learning_rate": 4.502429988588263e-06, "loss": 0.6077, "mean_token_accuracy": 0.8732910007238388, "num_tokens": 114767907.0, "step": 1073 }, { "epoch": 2.4469783352337515, "grad_norm": 3.640625, "learning_rate": 4.50130179619099e-06, "loss": 0.5895, "mean_token_accuracy": 0.8783971816301346, "num_tokens": 114874874.0, "step": 1074 }, { "epoch": 2.4492588369441277, "grad_norm": 2.390625, "learning_rate": 4.500172467893455e-06, "loss": 0.5892, "mean_token_accuracy": 0.8795461803674698, "num_tokens": 114981618.0, "step": 1075 }, { "epoch": 2.451539338654504, "grad_norm": 5.0625, "learning_rate": 4.499042004336642e-06, "loss": 0.5883, "mean_token_accuracy": 0.8823413103818893, "num_tokens": 115088543.0, "step": 1076 }, { "epoch": 2.45381984036488, "grad_norm": 6.90625, "learning_rate": 4.497910406162182e-06, "loss": 0.5928, "mean_token_accuracy": 0.8751686066389084, "num_tokens": 115195316.0, "step": 1077 }, { "epoch": 2.4561003420752567, "grad_norm": 6.28125, "learning_rate": 4.496777674012345e-06, "loss": 0.6081, "mean_token_accuracy": 0.8768873512744904, "num_tokens": 115302063.0, "step": 1078 }, { "epoch": 2.458380843785633, "grad_norm": 2.125, "learning_rate": 4.495643808530049e-06, "loss": 0.6031, "mean_token_accuracy": 0.8746660053730011, "num_tokens": 115409028.0, "step": 1079 }, { "epoch": 2.460661345496009, "grad_norm": 4.375, "learning_rate": 4.494508810358855e-06, "loss": 0.5939, "mean_token_accuracy": 0.8768034875392914, "num_tokens": 115516132.0, "step": 1080 }, { "epoch": 2.4629418472063853, "grad_norm": 5.0625, "learning_rate": 4.4933726801429665e-06, "loss": 0.6003, "mean_token_accuracy": 0.8732063323259354, "num_tokens": 115623543.0, "step": 1081 }, { "epoch": 2.4652223489167615, "grad_norm": 6.0, "learning_rate": 4.4922354185272275e-06, "loss": 0.6137, "mean_token_accuracy": 0.875907376408577, "num_tokens": 115730261.0, "step": 1082 }, { "epoch": 2.467502850627138, "grad_norm": 5.90625, "learning_rate": 4.491097026157127e-06, "loss": 0.5997, "mean_token_accuracy": 0.8774657398462296, "num_tokens": 115837186.0, "step": 1083 }, { "epoch": 2.4697833523375143, "grad_norm": 3.375, "learning_rate": 4.489957503678794e-06, "loss": 0.5833, "mean_token_accuracy": 0.8786139786243439, "num_tokens": 115944585.0, "step": 1084 }, { "epoch": 2.4720638540478905, "grad_norm": 6.3125, "learning_rate": 4.488816851738999e-06, "loss": 0.5921, "mean_token_accuracy": 0.881994903087616, "num_tokens": 116051127.0, "step": 1085 }, { "epoch": 2.4743443557582667, "grad_norm": 4.0, "learning_rate": 4.487675070985156e-06, "loss": 0.591, "mean_token_accuracy": 0.879449263215065, "num_tokens": 116158228.0, "step": 1086 }, { "epoch": 2.476624857468643, "grad_norm": 3.5625, "learning_rate": 4.4865321620653144e-06, "loss": 0.5908, "mean_token_accuracy": 0.8789650350809097, "num_tokens": 116265116.0, "step": 1087 }, { "epoch": 2.4789053591790196, "grad_norm": 4.34375, "learning_rate": 4.485388125628171e-06, "loss": 0.5956, "mean_token_accuracy": 0.8748547732830048, "num_tokens": 116372159.0, "step": 1088 }, { "epoch": 2.4811858608893957, "grad_norm": 5.40625, "learning_rate": 4.484242962323056e-06, "loss": 0.5757, "mean_token_accuracy": 0.8844872564077377, "num_tokens": 116478884.0, "step": 1089 }, { "epoch": 2.483466362599772, "grad_norm": 5.09375, "learning_rate": 4.483096672799942e-06, "loss": 0.597, "mean_token_accuracy": 0.8777136653661728, "num_tokens": 116585905.0, "step": 1090 }, { "epoch": 2.485746864310148, "grad_norm": 4.3125, "learning_rate": 4.481949257709442e-06, "loss": 0.5792, "mean_token_accuracy": 0.8820765465497971, "num_tokens": 116693091.0, "step": 1091 }, { "epoch": 2.4880273660205243, "grad_norm": 3.453125, "learning_rate": 4.480800717702807e-06, "loss": 0.5935, "mean_token_accuracy": 0.8752633780241013, "num_tokens": 116801048.0, "step": 1092 }, { "epoch": 2.490307867730901, "grad_norm": 2.046875, "learning_rate": 4.479651053431926e-06, "loss": 0.5861, "mean_token_accuracy": 0.8796508759260178, "num_tokens": 116907982.0, "step": 1093 }, { "epoch": 2.492588369441277, "grad_norm": 2.875, "learning_rate": 4.4785002655493246e-06, "loss": 0.5905, "mean_token_accuracy": 0.8795785456895828, "num_tokens": 117015153.0, "step": 1094 }, { "epoch": 2.4948688711516533, "grad_norm": 2.34375, "learning_rate": 4.477348354708169e-06, "loss": 0.5995, "mean_token_accuracy": 0.8773641586303711, "num_tokens": 117122420.0, "step": 1095 }, { "epoch": 2.4971493728620295, "grad_norm": 2.984375, "learning_rate": 4.476195321562262e-06, "loss": 0.5962, "mean_token_accuracy": 0.8779642879962921, "num_tokens": 117229161.0, "step": 1096 }, { "epoch": 2.4994298745724057, "grad_norm": 2.609375, "learning_rate": 4.475041166766042e-06, "loss": 0.6134, "mean_token_accuracy": 0.8731478750705719, "num_tokens": 117336152.0, "step": 1097 }, { "epoch": 2.5017103762827824, "grad_norm": 4.875, "learning_rate": 4.473885890974586e-06, "loss": 0.614, "mean_token_accuracy": 0.8745481222867966, "num_tokens": 117443787.0, "step": 1098 }, { "epoch": 2.5039908779931586, "grad_norm": 3.796875, "learning_rate": 4.472729494843605e-06, "loss": 0.5917, "mean_token_accuracy": 0.8793596476316452, "num_tokens": 117550788.0, "step": 1099 }, { "epoch": 2.5062713797035348, "grad_norm": 2.65625, "learning_rate": 4.471571979029448e-06, "loss": 0.5808, "mean_token_accuracy": 0.8826311677694321, "num_tokens": 117658107.0, "step": 1100 }, { "epoch": 2.5062713797035348, "eval_loss": 0.6002530455589294, "eval_mean_token_accuracy": 0.8773398306433239, "eval_num_tokens": 117658107.0, "eval_runtime": 58.5432, "eval_samples_per_second": 143.228, "eval_steps_per_second": 4.492, "step": 1100 }, { "epoch": 2.508551881413911, "grad_norm": 2.1875, "learning_rate": 4.470413344189098e-06, "loss": 0.5837, "mean_token_accuracy": 0.8801444619894028, "num_tokens": 117764556.0, "step": 1101 }, { "epoch": 2.5108323831242876, "grad_norm": 3.0625, "learning_rate": 4.469253590980175e-06, "loss": 0.5925, "mean_token_accuracy": 0.8782180845737457, "num_tokens": 117871482.0, "step": 1102 }, { "epoch": 2.5131128848346638, "grad_norm": 2.46875, "learning_rate": 4.46809272006093e-06, "loss": 0.5886, "mean_token_accuracy": 0.8766061216592789, "num_tokens": 117978366.0, "step": 1103 }, { "epoch": 2.51539338654504, "grad_norm": 2.53125, "learning_rate": 4.466930732090254e-06, "loss": 0.5901, "mean_token_accuracy": 0.8791538327932358, "num_tokens": 118085055.0, "step": 1104 }, { "epoch": 2.517673888255416, "grad_norm": 2.765625, "learning_rate": 4.465767627727668e-06, "loss": 0.571, "mean_token_accuracy": 0.8798196315765381, "num_tokens": 118192713.0, "step": 1105 }, { "epoch": 2.5199543899657924, "grad_norm": 2.265625, "learning_rate": 4.464603407633326e-06, "loss": 0.6121, "mean_token_accuracy": 0.8753893822431564, "num_tokens": 118299578.0, "step": 1106 }, { "epoch": 2.5222348916761685, "grad_norm": 2.4375, "learning_rate": 4.463438072468018e-06, "loss": 0.5943, "mean_token_accuracy": 0.8764231652021408, "num_tokens": 118406017.0, "step": 1107 }, { "epoch": 2.524515393386545, "grad_norm": 2.671875, "learning_rate": 4.462271622893166e-06, "loss": 0.6068, "mean_token_accuracy": 0.8774004876613617, "num_tokens": 118512926.0, "step": 1108 }, { "epoch": 2.5267958950969214, "grad_norm": 2.859375, "learning_rate": 4.461104059570825e-06, "loss": 0.6248, "mean_token_accuracy": 0.8694035857915878, "num_tokens": 118619939.0, "step": 1109 }, { "epoch": 2.5290763968072976, "grad_norm": 2.546875, "learning_rate": 4.4599353831636785e-06, "loss": 0.5785, "mean_token_accuracy": 0.8803573846817017, "num_tokens": 118727362.0, "step": 1110 }, { "epoch": 2.5313568985176738, "grad_norm": 6.84375, "learning_rate": 4.458765594335048e-06, "loss": 0.5913, "mean_token_accuracy": 0.877105325460434, "num_tokens": 118834306.0, "step": 1111 }, { "epoch": 2.5336374002280504, "grad_norm": 2.421875, "learning_rate": 4.457594693748881e-06, "loss": 0.6192, "mean_token_accuracy": 0.8765625059604645, "num_tokens": 118941239.0, "step": 1112 }, { "epoch": 2.5359179019384266, "grad_norm": 2.546875, "learning_rate": 4.456422682069758e-06, "loss": 0.5766, "mean_token_accuracy": 0.8844843655824661, "num_tokens": 119048172.0, "step": 1113 }, { "epoch": 2.538198403648803, "grad_norm": 2.9375, "learning_rate": 4.455249559962892e-06, "loss": 0.5867, "mean_token_accuracy": 0.880544051527977, "num_tokens": 119155079.0, "step": 1114 }, { "epoch": 2.540478905359179, "grad_norm": 2.140625, "learning_rate": 4.454075328094123e-06, "loss": 0.5728, "mean_token_accuracy": 0.8793693780899048, "num_tokens": 119262757.0, "step": 1115 }, { "epoch": 2.542759407069555, "grad_norm": 2.359375, "learning_rate": 4.452899987129922e-06, "loss": 0.5934, "mean_token_accuracy": 0.8787418156862259, "num_tokens": 119370566.0, "step": 1116 }, { "epoch": 2.5450399087799314, "grad_norm": 2.625, "learning_rate": 4.4517235377373915e-06, "loss": 0.6146, "mean_token_accuracy": 0.8720010370016098, "num_tokens": 119477698.0, "step": 1117 }, { "epoch": 2.547320410490308, "grad_norm": 2.953125, "learning_rate": 4.45054598058426e-06, "loss": 0.5706, "mean_token_accuracy": 0.8797213733196259, "num_tokens": 119584695.0, "step": 1118 }, { "epoch": 2.549600912200684, "grad_norm": 2.40625, "learning_rate": 4.449367316338887e-06, "loss": 0.5789, "mean_token_accuracy": 0.8773613125085831, "num_tokens": 119691823.0, "step": 1119 }, { "epoch": 2.5518814139110604, "grad_norm": 2.796875, "learning_rate": 4.448187545670258e-06, "loss": 0.5892, "mean_token_accuracy": 0.881779134273529, "num_tokens": 119799172.0, "step": 1120 }, { "epoch": 2.5541619156214366, "grad_norm": 3.1875, "learning_rate": 4.44700666924799e-06, "loss": 0.6025, "mean_token_accuracy": 0.874766156077385, "num_tokens": 119905784.0, "step": 1121 }, { "epoch": 2.556442417331813, "grad_norm": 3.21875, "learning_rate": 4.4458246877423254e-06, "loss": 0.6032, "mean_token_accuracy": 0.8753187209367752, "num_tokens": 120012658.0, "step": 1122 }, { "epoch": 2.5587229190421894, "grad_norm": 3.296875, "learning_rate": 4.444641601824134e-06, "loss": 0.5871, "mean_token_accuracy": 0.8781961053609848, "num_tokens": 120119347.0, "step": 1123 }, { "epoch": 2.5610034207525656, "grad_norm": 2.765625, "learning_rate": 4.443457412164911e-06, "loss": 0.5931, "mean_token_accuracy": 0.8751012235879898, "num_tokens": 120226332.0, "step": 1124 }, { "epoch": 2.563283922462942, "grad_norm": 3.03125, "learning_rate": 4.442272119436781e-06, "loss": 0.5954, "mean_token_accuracy": 0.8773366808891296, "num_tokens": 120334032.0, "step": 1125 }, { "epoch": 2.565564424173318, "grad_norm": 3.5, "learning_rate": 4.441085724312494e-06, "loss": 0.5827, "mean_token_accuracy": 0.8794488459825516, "num_tokens": 120441226.0, "step": 1126 }, { "epoch": 2.567844925883694, "grad_norm": 2.875, "learning_rate": 4.4398982274654235e-06, "loss": 0.5724, "mean_token_accuracy": 0.8812543451786041, "num_tokens": 120549055.0, "step": 1127 }, { "epoch": 2.570125427594071, "grad_norm": 4.28125, "learning_rate": 4.43870962956957e-06, "loss": 0.567, "mean_token_accuracy": 0.8838596194982529, "num_tokens": 120656376.0, "step": 1128 }, { "epoch": 2.572405929304447, "grad_norm": 3.125, "learning_rate": 4.437519931299559e-06, "loss": 0.5838, "mean_token_accuracy": 0.878927618265152, "num_tokens": 120764475.0, "step": 1129 }, { "epoch": 2.574686431014823, "grad_norm": 2.609375, "learning_rate": 4.43632913333064e-06, "loss": 0.5855, "mean_token_accuracy": 0.8791298568248749, "num_tokens": 120871237.0, "step": 1130 }, { "epoch": 2.5769669327251994, "grad_norm": 2.296875, "learning_rate": 4.435137236338688e-06, "loss": 0.5852, "mean_token_accuracy": 0.8821601420640945, "num_tokens": 120978615.0, "step": 1131 }, { "epoch": 2.579247434435576, "grad_norm": 6.46875, "learning_rate": 4.433944241000199e-06, "loss": 0.6004, "mean_token_accuracy": 0.8753758072853088, "num_tokens": 121085644.0, "step": 1132 }, { "epoch": 2.581527936145952, "grad_norm": 3.328125, "learning_rate": 4.4327501479922955e-06, "loss": 0.5738, "mean_token_accuracy": 0.8809983879327774, "num_tokens": 121193058.0, "step": 1133 }, { "epoch": 2.5838084378563284, "grad_norm": 2.734375, "learning_rate": 4.431554957992722e-06, "loss": 0.6168, "mean_token_accuracy": 0.8755798637866974, "num_tokens": 121299745.0, "step": 1134 }, { "epoch": 2.5860889395667046, "grad_norm": 4.0625, "learning_rate": 4.430358671679843e-06, "loss": 0.6168, "mean_token_accuracy": 0.8684369623661041, "num_tokens": 121406377.0, "step": 1135 }, { "epoch": 2.588369441277081, "grad_norm": 3.03125, "learning_rate": 4.42916128973265e-06, "loss": 0.6126, "mean_token_accuracy": 0.8750972300767899, "num_tokens": 121513300.0, "step": 1136 }, { "epoch": 2.590649942987457, "grad_norm": 2.9375, "learning_rate": 4.427962812830753e-06, "loss": 0.6178, "mean_token_accuracy": 0.8743036985397339, "num_tokens": 121620489.0, "step": 1137 }, { "epoch": 2.5929304446978336, "grad_norm": 4.25, "learning_rate": 4.426763241654383e-06, "loss": 0.6034, "mean_token_accuracy": 0.8752316683530807, "num_tokens": 121727667.0, "step": 1138 }, { "epoch": 2.59521094640821, "grad_norm": 5.15625, "learning_rate": 4.425562576884396e-06, "loss": 0.5877, "mean_token_accuracy": 0.8789777606725693, "num_tokens": 121834628.0, "step": 1139 }, { "epoch": 2.597491448118586, "grad_norm": 4.125, "learning_rate": 4.424360819202264e-06, "loss": 0.592, "mean_token_accuracy": 0.8804343789815903, "num_tokens": 121941613.0, "step": 1140 }, { "epoch": 2.5997719498289626, "grad_norm": 2.59375, "learning_rate": 4.423157969290081e-06, "loss": 0.6015, "mean_token_accuracy": 0.8748785555362701, "num_tokens": 122048337.0, "step": 1141 }, { "epoch": 2.602052451539339, "grad_norm": 3.328125, "learning_rate": 4.421954027830565e-06, "loss": 0.5984, "mean_token_accuracy": 0.8761484026908875, "num_tokens": 122155743.0, "step": 1142 }, { "epoch": 2.604332953249715, "grad_norm": 3.5625, "learning_rate": 4.4207489955070465e-06, "loss": 0.5864, "mean_token_accuracy": 0.8797977864742279, "num_tokens": 122262415.0, "step": 1143 }, { "epoch": 2.6066134549600912, "grad_norm": 3.359375, "learning_rate": 4.419542873003479e-06, "loss": 0.5849, "mean_token_accuracy": 0.879091739654541, "num_tokens": 122369829.0, "step": 1144 }, { "epoch": 2.6088939566704674, "grad_norm": 3.921875, "learning_rate": 4.418335661004436e-06, "loss": 0.6004, "mean_token_accuracy": 0.8749701827764511, "num_tokens": 122476808.0, "step": 1145 }, { "epoch": 2.6111744583808436, "grad_norm": 4.65625, "learning_rate": 4.417127360195107e-06, "loss": 0.5838, "mean_token_accuracy": 0.8770206123590469, "num_tokens": 122583941.0, "step": 1146 }, { "epoch": 2.61345496009122, "grad_norm": 2.328125, "learning_rate": 4.415917971261299e-06, "loss": 0.5929, "mean_token_accuracy": 0.8799891471862793, "num_tokens": 122691451.0, "step": 1147 }, { "epoch": 2.6157354618015964, "grad_norm": 2.671875, "learning_rate": 4.414707494889439e-06, "loss": 0.5782, "mean_token_accuracy": 0.8810625970363617, "num_tokens": 122798569.0, "step": 1148 }, { "epoch": 2.6180159635119726, "grad_norm": 2.34375, "learning_rate": 4.413495931766571e-06, "loss": 0.5959, "mean_token_accuracy": 0.8791490197181702, "num_tokens": 122906142.0, "step": 1149 }, { "epoch": 2.620296465222349, "grad_norm": 4.84375, "learning_rate": 4.412283282580352e-06, "loss": 0.5818, "mean_token_accuracy": 0.880540743470192, "num_tokens": 123012808.0, "step": 1150 }, { "epoch": 2.6225769669327255, "grad_norm": 2.296875, "learning_rate": 4.41106954801906e-06, "loss": 0.5901, "mean_token_accuracy": 0.8770016133785248, "num_tokens": 123119979.0, "step": 1151 }, { "epoch": 2.6248574686431017, "grad_norm": 3.859375, "learning_rate": 4.409854728771588e-06, "loss": 0.5875, "mean_token_accuracy": 0.8810955137014389, "num_tokens": 123227161.0, "step": 1152 }, { "epoch": 2.627137970353478, "grad_norm": 4.375, "learning_rate": 4.4086388255274425e-06, "loss": 0.5973, "mean_token_accuracy": 0.8776374161243439, "num_tokens": 123333866.0, "step": 1153 }, { "epoch": 2.629418472063854, "grad_norm": 2.171875, "learning_rate": 4.407421838976747e-06, "loss": 0.5688, "mean_token_accuracy": 0.8836002796888351, "num_tokens": 123441095.0, "step": 1154 }, { "epoch": 2.6316989737742302, "grad_norm": 2.390625, "learning_rate": 4.40620376981024e-06, "loss": 0.5776, "mean_token_accuracy": 0.8822665065526962, "num_tokens": 123548255.0, "step": 1155 }, { "epoch": 2.6339794754846064, "grad_norm": 3.875, "learning_rate": 4.404984618719275e-06, "loss": 0.6157, "mean_token_accuracy": 0.8759646117687225, "num_tokens": 123654882.0, "step": 1156 }, { "epoch": 2.636259977194983, "grad_norm": 3.609375, "learning_rate": 4.403764386395817e-06, "loss": 0.5805, "mean_token_accuracy": 0.8818103224039078, "num_tokens": 123762537.0, "step": 1157 }, { "epoch": 2.6385404789053593, "grad_norm": 2.28125, "learning_rate": 4.402543073532446e-06, "loss": 0.5647, "mean_token_accuracy": 0.8845062106847763, "num_tokens": 123870045.0, "step": 1158 }, { "epoch": 2.6408209806157354, "grad_norm": 2.109375, "learning_rate": 4.401320680822357e-06, "loss": 0.5626, "mean_token_accuracy": 0.8820901811122894, "num_tokens": 123977756.0, "step": 1159 }, { "epoch": 2.6431014823261116, "grad_norm": 4.34375, "learning_rate": 4.400097208959357e-06, "loss": 0.5636, "mean_token_accuracy": 0.8831377625465393, "num_tokens": 124085720.0, "step": 1160 }, { "epoch": 2.6453819840364883, "grad_norm": 5.375, "learning_rate": 4.398872658637863e-06, "loss": 0.5896, "mean_token_accuracy": 0.8773757070302963, "num_tokens": 124192659.0, "step": 1161 }, { "epoch": 2.6476624857468645, "grad_norm": 4.9375, "learning_rate": 4.397647030552907e-06, "loss": 0.6128, "mean_token_accuracy": 0.8716171085834503, "num_tokens": 124299363.0, "step": 1162 }, { "epoch": 2.6499429874572407, "grad_norm": 4.75, "learning_rate": 4.396420325400132e-06, "loss": 0.5984, "mean_token_accuracy": 0.8754719495773315, "num_tokens": 124406432.0, "step": 1163 }, { "epoch": 2.652223489167617, "grad_norm": 8.6875, "learning_rate": 4.3951925438757936e-06, "loss": 0.6075, "mean_token_accuracy": 0.8721831738948822, "num_tokens": 124513942.0, "step": 1164 }, { "epoch": 2.654503990877993, "grad_norm": 5.6875, "learning_rate": 4.3939636866767535e-06, "loss": 0.5874, "mean_token_accuracy": 0.8772239238023758, "num_tokens": 124621397.0, "step": 1165 }, { "epoch": 2.6567844925883692, "grad_norm": 9.8125, "learning_rate": 4.39273375450049e-06, "loss": 0.6082, "mean_token_accuracy": 0.8729386478662491, "num_tokens": 124728132.0, "step": 1166 }, { "epoch": 2.659064994298746, "grad_norm": 5.09375, "learning_rate": 4.391502748045088e-06, "loss": 0.5892, "mean_token_accuracy": 0.8794106990098953, "num_tokens": 124834818.0, "step": 1167 }, { "epoch": 2.661345496009122, "grad_norm": 3.859375, "learning_rate": 4.390270668009244e-06, "loss": 0.5857, "mean_token_accuracy": 0.8794341683387756, "num_tokens": 124941733.0, "step": 1168 }, { "epoch": 2.6636259977194983, "grad_norm": 2.953125, "learning_rate": 4.38903751509226e-06, "loss": 0.5853, "mean_token_accuracy": 0.8788381963968277, "num_tokens": 125049080.0, "step": 1169 }, { "epoch": 2.6659064994298745, "grad_norm": 3.546875, "learning_rate": 4.3878032899940534e-06, "loss": 0.601, "mean_token_accuracy": 0.8787420690059662, "num_tokens": 125156160.0, "step": 1170 }, { "epoch": 2.668187001140251, "grad_norm": 6.1875, "learning_rate": 4.386567993415144e-06, "loss": 0.5948, "mean_token_accuracy": 0.8779895603656769, "num_tokens": 125263507.0, "step": 1171 }, { "epoch": 2.6704675028506273, "grad_norm": 5.375, "learning_rate": 4.3853316260566635e-06, "loss": 0.5665, "mean_token_accuracy": 0.8852428048849106, "num_tokens": 125370490.0, "step": 1172 }, { "epoch": 2.6727480045610035, "grad_norm": 5.03125, "learning_rate": 4.384094188620349e-06, "loss": 0.6148, "mean_token_accuracy": 0.8747402876615524, "num_tokens": 125477079.0, "step": 1173 }, { "epoch": 2.6750285062713797, "grad_norm": 3.4375, "learning_rate": 4.3828556818085485e-06, "loss": 0.5914, "mean_token_accuracy": 0.8805000483989716, "num_tokens": 125584049.0, "step": 1174 }, { "epoch": 2.677309007981756, "grad_norm": 2.890625, "learning_rate": 4.3816161063242115e-06, "loss": 0.5938, "mean_token_accuracy": 0.8765310496091843, "num_tokens": 125690629.0, "step": 1175 }, { "epoch": 2.679589509692132, "grad_norm": 2.625, "learning_rate": 4.3803754628708995e-06, "loss": 0.5705, "mean_token_accuracy": 0.8820624649524689, "num_tokens": 125798232.0, "step": 1176 }, { "epoch": 2.6818700114025087, "grad_norm": 3.65625, "learning_rate": 4.379133752152776e-06, "loss": 0.5746, "mean_token_accuracy": 0.8863883018493652, "num_tokens": 125905642.0, "step": 1177 }, { "epoch": 2.684150513112885, "grad_norm": 2.25, "learning_rate": 4.377890974874614e-06, "loss": 0.5849, "mean_token_accuracy": 0.8794945180416107, "num_tokens": 126013558.0, "step": 1178 }, { "epoch": 2.686431014823261, "grad_norm": 2.125, "learning_rate": 4.376647131741787e-06, "loss": 0.5741, "mean_token_accuracy": 0.8828789293766022, "num_tokens": 126120692.0, "step": 1179 }, { "epoch": 2.6887115165336373, "grad_norm": 2.578125, "learning_rate": 4.375402223460279e-06, "loss": 0.6034, "mean_token_accuracy": 0.8755092322826385, "num_tokens": 126227571.0, "step": 1180 }, { "epoch": 2.690992018244014, "grad_norm": 5.15625, "learning_rate": 4.3741562507366754e-06, "loss": 0.6015, "mean_token_accuracy": 0.8771520704030991, "num_tokens": 126334666.0, "step": 1181 }, { "epoch": 2.69327251995439, "grad_norm": 2.640625, "learning_rate": 4.3729092142781655e-06, "loss": 0.5598, "mean_token_accuracy": 0.8833112865686417, "num_tokens": 126441830.0, "step": 1182 }, { "epoch": 2.6955530216647663, "grad_norm": 4.4375, "learning_rate": 4.3716611147925435e-06, "loss": 0.5759, "mean_token_accuracy": 0.8818268924951553, "num_tokens": 126548549.0, "step": 1183 }, { "epoch": 2.6978335233751425, "grad_norm": 3.015625, "learning_rate": 4.370411952988207e-06, "loss": 0.5963, "mean_token_accuracy": 0.8798163086175919, "num_tokens": 126655776.0, "step": 1184 }, { "epoch": 2.7001140250855187, "grad_norm": 2.484375, "learning_rate": 4.369161729574155e-06, "loss": 0.6053, "mean_token_accuracy": 0.8758595883846283, "num_tokens": 126762585.0, "step": 1185 }, { "epoch": 2.702394526795895, "grad_norm": 2.671875, "learning_rate": 4.367910445259991e-06, "loss": 0.5876, "mean_token_accuracy": 0.8808387517929077, "num_tokens": 126869208.0, "step": 1186 }, { "epoch": 2.7046750285062715, "grad_norm": 3.703125, "learning_rate": 4.36665810075592e-06, "loss": 0.5894, "mean_token_accuracy": 0.877717912197113, "num_tokens": 126976031.0, "step": 1187 }, { "epoch": 2.7069555302166477, "grad_norm": 3.328125, "learning_rate": 4.365404696772748e-06, "loss": 0.5819, "mean_token_accuracy": 0.8818590492010117, "num_tokens": 127083381.0, "step": 1188 }, { "epoch": 2.709236031927024, "grad_norm": 2.0625, "learning_rate": 4.364150234021883e-06, "loss": 0.5807, "mean_token_accuracy": 0.8820794522762299, "num_tokens": 127190792.0, "step": 1189 }, { "epoch": 2.7115165336374, "grad_norm": 2.859375, "learning_rate": 4.362894713215334e-06, "loss": 0.5693, "mean_token_accuracy": 0.8834023624658585, "num_tokens": 127298871.0, "step": 1190 }, { "epoch": 2.7137970353477767, "grad_norm": 2.359375, "learning_rate": 4.361638135065711e-06, "loss": 0.557, "mean_token_accuracy": 0.8848878443241119, "num_tokens": 127407231.0, "step": 1191 }, { "epoch": 2.716077537058153, "grad_norm": 3.25, "learning_rate": 4.360380500286222e-06, "loss": 0.5808, "mean_token_accuracy": 0.8773325383663177, "num_tokens": 127514509.0, "step": 1192 }, { "epoch": 2.718358038768529, "grad_norm": 2.921875, "learning_rate": 4.359121809590678e-06, "loss": 0.5884, "mean_token_accuracy": 0.8788514882326126, "num_tokens": 127621769.0, "step": 1193 }, { "epoch": 2.7206385404789053, "grad_norm": 3.390625, "learning_rate": 4.357862063693486e-06, "loss": 0.6107, "mean_token_accuracy": 0.8736522942781448, "num_tokens": 127728235.0, "step": 1194 }, { "epoch": 2.7229190421892815, "grad_norm": 2.640625, "learning_rate": 4.356601263309654e-06, "loss": 0.5938, "mean_token_accuracy": 0.8788127601146698, "num_tokens": 127834915.0, "step": 1195 }, { "epoch": 2.7251995438996577, "grad_norm": 2.5, "learning_rate": 4.355339409154788e-06, "loss": 0.5965, "mean_token_accuracy": 0.8762216120958328, "num_tokens": 127942200.0, "step": 1196 }, { "epoch": 2.7274800456100343, "grad_norm": 2.65625, "learning_rate": 4.354076501945093e-06, "loss": 0.6054, "mean_token_accuracy": 0.8778090626001358, "num_tokens": 128049307.0, "step": 1197 }, { "epoch": 2.7297605473204105, "grad_norm": 2.53125, "learning_rate": 4.352812542397369e-06, "loss": 0.5963, "mean_token_accuracy": 0.8793000727891922, "num_tokens": 128155836.0, "step": 1198 }, { "epoch": 2.7320410490307867, "grad_norm": 2.546875, "learning_rate": 4.351547531229016e-06, "loss": 0.5806, "mean_token_accuracy": 0.8827884644269943, "num_tokens": 128262857.0, "step": 1199 }, { "epoch": 2.734321550741163, "grad_norm": 4.21875, "learning_rate": 4.350281469158029e-06, "loss": 0.61, "mean_token_accuracy": 0.8760698437690735, "num_tokens": 128369643.0, "step": 1200 }, { "epoch": 2.7366020524515395, "grad_norm": 2.5625, "learning_rate": 4.3490143569030025e-06, "loss": 0.5919, "mean_token_accuracy": 0.8797038048505783, "num_tokens": 128476571.0, "step": 1201 }, { "epoch": 2.7388825541619157, "grad_norm": 4.21875, "learning_rate": 4.347746195183123e-06, "loss": 0.5968, "mean_token_accuracy": 0.8761547356843948, "num_tokens": 128583850.0, "step": 1202 }, { "epoch": 2.741163055872292, "grad_norm": 4.28125, "learning_rate": 4.346476984718176e-06, "loss": 0.6078, "mean_token_accuracy": 0.8726416677236557, "num_tokens": 128691263.0, "step": 1203 }, { "epoch": 2.743443557582668, "grad_norm": 5.59375, "learning_rate": 4.345206726228538e-06, "loss": 0.5859, "mean_token_accuracy": 0.8812550455331802, "num_tokens": 128798421.0, "step": 1204 }, { "epoch": 2.7457240592930443, "grad_norm": 2.21875, "learning_rate": 4.343935420435187e-06, "loss": 0.5942, "mean_token_accuracy": 0.8780511021614075, "num_tokens": 128905472.0, "step": 1205 }, { "epoch": 2.7480045610034205, "grad_norm": 2.484375, "learning_rate": 4.34266306805969e-06, "loss": 0.5884, "mean_token_accuracy": 0.8798847049474716, "num_tokens": 129012064.0, "step": 1206 }, { "epoch": 2.750285062713797, "grad_norm": 3.171875, "learning_rate": 4.341389669824209e-06, "loss": 0.5915, "mean_token_accuracy": 0.8787447810173035, "num_tokens": 129118812.0, "step": 1207 }, { "epoch": 2.7525655644241733, "grad_norm": 4.0, "learning_rate": 4.340115226451501e-06, "loss": 0.602, "mean_token_accuracy": 0.8773921728134155, "num_tokens": 129226022.0, "step": 1208 }, { "epoch": 2.7548460661345495, "grad_norm": 5.0625, "learning_rate": 4.338839738664915e-06, "loss": 0.591, "mean_token_accuracy": 0.8822150230407715, "num_tokens": 129333172.0, "step": 1209 }, { "epoch": 2.757126567844926, "grad_norm": 2.515625, "learning_rate": 4.3375632071883935e-06, "loss": 0.5966, "mean_token_accuracy": 0.8766026943922043, "num_tokens": 129440070.0, "step": 1210 }, { "epoch": 2.7594070695553023, "grad_norm": 3.453125, "learning_rate": 4.336285632746472e-06, "loss": 0.5997, "mean_token_accuracy": 0.8804485648870468, "num_tokens": 129547114.0, "step": 1211 }, { "epoch": 2.7616875712656785, "grad_norm": 2.59375, "learning_rate": 4.3350070160642754e-06, "loss": 0.588, "mean_token_accuracy": 0.8801703006029129, "num_tokens": 129653799.0, "step": 1212 }, { "epoch": 2.7639680729760547, "grad_norm": 3.5625, "learning_rate": 4.333727357867523e-06, "loss": 0.5735, "mean_token_accuracy": 0.8849901556968689, "num_tokens": 129761360.0, "step": 1213 }, { "epoch": 2.766248574686431, "grad_norm": 2.5, "learning_rate": 4.3324466588825235e-06, "loss": 0.5723, "mean_token_accuracy": 0.8838348835706711, "num_tokens": 129868322.0, "step": 1214 }, { "epoch": 2.768529076396807, "grad_norm": 2.125, "learning_rate": 4.331164919836177e-06, "loss": 0.5913, "mean_token_accuracy": 0.8776644319295883, "num_tokens": 129975643.0, "step": 1215 }, { "epoch": 2.7708095781071833, "grad_norm": 2.84375, "learning_rate": 4.329882141455974e-06, "loss": 0.5871, "mean_token_accuracy": 0.8794203400611877, "num_tokens": 130082157.0, "step": 1216 }, { "epoch": 2.77309007981756, "grad_norm": 2.859375, "learning_rate": 4.3285983244699955e-06, "loss": 0.5854, "mean_token_accuracy": 0.8823015242815018, "num_tokens": 130189446.0, "step": 1217 }, { "epoch": 2.775370581527936, "grad_norm": 2.796875, "learning_rate": 4.327313469606911e-06, "loss": 0.5938, "mean_token_accuracy": 0.8795596957206726, "num_tokens": 130296713.0, "step": 1218 }, { "epoch": 2.7776510832383123, "grad_norm": 2.78125, "learning_rate": 4.326027577595977e-06, "loss": 0.5853, "mean_token_accuracy": 0.8805474638938904, "num_tokens": 130403572.0, "step": 1219 }, { "epoch": 2.779931584948689, "grad_norm": 2.421875, "learning_rate": 4.324740649167044e-06, "loss": 0.5829, "mean_token_accuracy": 0.8781325221061707, "num_tokens": 130510616.0, "step": 1220 }, { "epoch": 2.782212086659065, "grad_norm": 2.0625, "learning_rate": 4.323452685050545e-06, "loss": 0.5753, "mean_token_accuracy": 0.8842640519142151, "num_tokens": 130618088.0, "step": 1221 }, { "epoch": 2.7844925883694414, "grad_norm": 2.40625, "learning_rate": 4.3221636859775075e-06, "loss": 0.5815, "mean_token_accuracy": 0.8810494989156723, "num_tokens": 130725134.0, "step": 1222 }, { "epoch": 2.7867730900798175, "grad_norm": 4.03125, "learning_rate": 4.320873652679538e-06, "loss": 0.5892, "mean_token_accuracy": 0.8809178620576859, "num_tokens": 130832079.0, "step": 1223 }, { "epoch": 2.7890535917901937, "grad_norm": 2.234375, "learning_rate": 4.319582585888838e-06, "loss": 0.5868, "mean_token_accuracy": 0.8811828643083572, "num_tokens": 130939236.0, "step": 1224 }, { "epoch": 2.79133409350057, "grad_norm": 2.25, "learning_rate": 4.31829048633819e-06, "loss": 0.5929, "mean_token_accuracy": 0.8782161772251129, "num_tokens": 131046662.0, "step": 1225 }, { "epoch": 2.7936145952109466, "grad_norm": 2.625, "learning_rate": 4.316997354760965e-06, "loss": 0.5863, "mean_token_accuracy": 0.8795737326145172, "num_tokens": 131153949.0, "step": 1226 }, { "epoch": 2.7958950969213228, "grad_norm": 2.703125, "learning_rate": 4.3157031918911204e-06, "loss": 0.592, "mean_token_accuracy": 0.8794270306825638, "num_tokens": 131260517.0, "step": 1227 }, { "epoch": 2.798175598631699, "grad_norm": 2.5625, "learning_rate": 4.314407998463198e-06, "loss": 0.5663, "mean_token_accuracy": 0.881917729973793, "num_tokens": 131367523.0, "step": 1228 }, { "epoch": 2.800456100342075, "grad_norm": 6.40625, "learning_rate": 4.3131117752123235e-06, "loss": 0.598, "mean_token_accuracy": 0.8779633790254593, "num_tokens": 131474262.0, "step": 1229 }, { "epoch": 2.802736602052452, "grad_norm": 3.5, "learning_rate": 4.311814522874209e-06, "loss": 0.5936, "mean_token_accuracy": 0.8795278668403625, "num_tokens": 131581107.0, "step": 1230 }, { "epoch": 2.805017103762828, "grad_norm": 3.609375, "learning_rate": 4.3105162421851494e-06, "loss": 0.6044, "mean_token_accuracy": 0.8744796365499496, "num_tokens": 131687930.0, "step": 1231 }, { "epoch": 2.807297605473204, "grad_norm": 5.8125, "learning_rate": 4.309216933882025e-06, "loss": 0.5833, "mean_token_accuracy": 0.8764165937900543, "num_tokens": 131794765.0, "step": 1232 }, { "epoch": 2.8095781071835804, "grad_norm": 2.484375, "learning_rate": 4.307916598702296e-06, "loss": 0.5676, "mean_token_accuracy": 0.883644163608551, "num_tokens": 131902317.0, "step": 1233 }, { "epoch": 2.8118586088939566, "grad_norm": 3.65625, "learning_rate": 4.3066152373840105e-06, "loss": 0.6001, "mean_token_accuracy": 0.8781489878892899, "num_tokens": 132009851.0, "step": 1234 }, { "epoch": 2.8141391106043327, "grad_norm": 2.71875, "learning_rate": 4.305312850665794e-06, "loss": 0.5967, "mean_token_accuracy": 0.8778474628925323, "num_tokens": 132116939.0, "step": 1235 }, { "epoch": 2.8164196123147094, "grad_norm": 4.0, "learning_rate": 4.304009439286855e-06, "loss": 0.5897, "mean_token_accuracy": 0.8780102729797363, "num_tokens": 132223406.0, "step": 1236 }, { "epoch": 2.8187001140250856, "grad_norm": 2.953125, "learning_rate": 4.3027050039869865e-06, "loss": 0.5879, "mean_token_accuracy": 0.8773478418588638, "num_tokens": 132330700.0, "step": 1237 }, { "epoch": 2.8209806157354618, "grad_norm": 2.65625, "learning_rate": 4.301399545506561e-06, "loss": 0.5785, "mean_token_accuracy": 0.8794009536504745, "num_tokens": 132437921.0, "step": 1238 }, { "epoch": 2.823261117445838, "grad_norm": 2.421875, "learning_rate": 4.3000930645865305e-06, "loss": 0.581, "mean_token_accuracy": 0.8809588998556137, "num_tokens": 132545056.0, "step": 1239 }, { "epoch": 2.8255416191562146, "grad_norm": 2.453125, "learning_rate": 4.298785561968428e-06, "loss": 0.5899, "mean_token_accuracy": 0.8774296343326569, "num_tokens": 132651620.0, "step": 1240 }, { "epoch": 2.827822120866591, "grad_norm": 2.90625, "learning_rate": 4.297477038394368e-06, "loss": 0.5687, "mean_token_accuracy": 0.8836159557104111, "num_tokens": 132758652.0, "step": 1241 }, { "epoch": 2.830102622576967, "grad_norm": 2.53125, "learning_rate": 4.296167494607043e-06, "loss": 0.584, "mean_token_accuracy": 0.8778567016124725, "num_tokens": 132866236.0, "step": 1242 }, { "epoch": 2.832383124287343, "grad_norm": 3.078125, "learning_rate": 4.294856931349724e-06, "loss": 0.6096, "mean_token_accuracy": 0.8745770305395126, "num_tokens": 132973334.0, "step": 1243 }, { "epoch": 2.8346636259977194, "grad_norm": 2.515625, "learning_rate": 4.293545349366262e-06, "loss": 0.5915, "mean_token_accuracy": 0.8771042227745056, "num_tokens": 133079721.0, "step": 1244 }, { "epoch": 2.8369441277080956, "grad_norm": 2.578125, "learning_rate": 4.292232749401085e-06, "loss": 0.5904, "mean_token_accuracy": 0.8794742226600647, "num_tokens": 133186577.0, "step": 1245 }, { "epoch": 2.839224629418472, "grad_norm": 2.703125, "learning_rate": 4.2909191321992e-06, "loss": 0.6038, "mean_token_accuracy": 0.8782872408628464, "num_tokens": 133293871.0, "step": 1246 }, { "epoch": 2.8415051311288484, "grad_norm": 2.921875, "learning_rate": 4.2896044985061915e-06, "loss": 0.5959, "mean_token_accuracy": 0.8800533413887024, "num_tokens": 133400863.0, "step": 1247 }, { "epoch": 2.8437856328392246, "grad_norm": 2.421875, "learning_rate": 4.288288849068218e-06, "loss": 0.5854, "mean_token_accuracy": 0.8809787184000015, "num_tokens": 133507932.0, "step": 1248 }, { "epoch": 2.846066134549601, "grad_norm": 3.25, "learning_rate": 4.286972184632019e-06, "loss": 0.6134, "mean_token_accuracy": 0.8738250583410263, "num_tokens": 133615287.0, "step": 1249 }, { "epoch": 2.8483466362599774, "grad_norm": 2.140625, "learning_rate": 4.285654505944906e-06, "loss": 0.5767, "mean_token_accuracy": 0.8837039470672607, "num_tokens": 133722203.0, "step": 1250 }, { "epoch": 2.8506271379703536, "grad_norm": 3.78125, "learning_rate": 4.28433581375477e-06, "loss": 0.56, "mean_token_accuracy": 0.8851412385702133, "num_tokens": 133829983.0, "step": 1251 }, { "epoch": 2.85290763968073, "grad_norm": 3.5, "learning_rate": 4.283016108810073e-06, "loss": 0.6047, "mean_token_accuracy": 0.8773223906755447, "num_tokens": 133937207.0, "step": 1252 }, { "epoch": 2.855188141391106, "grad_norm": 2.328125, "learning_rate": 4.281695391859854e-06, "loss": 0.571, "mean_token_accuracy": 0.8886352777481079, "num_tokens": 134044589.0, "step": 1253 }, { "epoch": 2.857468643101482, "grad_norm": 5.09375, "learning_rate": 4.28037366365373e-06, "loss": 0.5988, "mean_token_accuracy": 0.8755118399858475, "num_tokens": 134151607.0, "step": 1254 }, { "epoch": 2.8597491448118584, "grad_norm": 2.8125, "learning_rate": 4.279050924941885e-06, "loss": 0.5854, "mean_token_accuracy": 0.8779601603746414, "num_tokens": 134258813.0, "step": 1255 }, { "epoch": 2.862029646522235, "grad_norm": 3.4375, "learning_rate": 4.2777271764750805e-06, "loss": 0.5679, "mean_token_accuracy": 0.8823637515306473, "num_tokens": 134366131.0, "step": 1256 }, { "epoch": 2.864310148232611, "grad_norm": 2.125, "learning_rate": 4.276402419004652e-06, "loss": 0.5843, "mean_token_accuracy": 0.8769189864397049, "num_tokens": 134472934.0, "step": 1257 }, { "epoch": 2.8665906499429874, "grad_norm": 2.3125, "learning_rate": 4.275076653282504e-06, "loss": 0.5915, "mean_token_accuracy": 0.8746793568134308, "num_tokens": 134579651.0, "step": 1258 }, { "epoch": 2.8688711516533636, "grad_norm": 3.59375, "learning_rate": 4.273749880061118e-06, "loss": 0.599, "mean_token_accuracy": 0.8750852793455124, "num_tokens": 134686349.0, "step": 1259 }, { "epoch": 2.8711516533637402, "grad_norm": 3.484375, "learning_rate": 4.272422100093542e-06, "loss": 0.6092, "mean_token_accuracy": 0.8773278295993805, "num_tokens": 134793245.0, "step": 1260 }, { "epoch": 2.8734321550741164, "grad_norm": 3.28125, "learning_rate": 4.271093314133401e-06, "loss": 0.6051, "mean_token_accuracy": 0.8744064420461655, "num_tokens": 134900346.0, "step": 1261 }, { "epoch": 2.8757126567844926, "grad_norm": 2.671875, "learning_rate": 4.269763522934888e-06, "loss": 0.5923, "mean_token_accuracy": 0.8789143562316895, "num_tokens": 135007561.0, "step": 1262 }, { "epoch": 2.877993158494869, "grad_norm": 7.625, "learning_rate": 4.268432727252765e-06, "loss": 0.5911, "mean_token_accuracy": 0.8737368434667587, "num_tokens": 135115241.0, "step": 1263 }, { "epoch": 2.880273660205245, "grad_norm": 5.4375, "learning_rate": 4.2671009278423665e-06, "loss": 0.5813, "mean_token_accuracy": 0.8808005452156067, "num_tokens": 135221997.0, "step": 1264 }, { "epoch": 2.882554161915621, "grad_norm": 3.890625, "learning_rate": 4.265768125459597e-06, "loss": 0.5756, "mean_token_accuracy": 0.879405215382576, "num_tokens": 135329159.0, "step": 1265 }, { "epoch": 2.884834663625998, "grad_norm": 2.53125, "learning_rate": 4.264434320860929e-06, "loss": 0.5794, "mean_token_accuracy": 0.8776859790086746, "num_tokens": 135436122.0, "step": 1266 }, { "epoch": 2.887115165336374, "grad_norm": 2.921875, "learning_rate": 4.2630995148034044e-06, "loss": 0.6147, "mean_token_accuracy": 0.8750355541706085, "num_tokens": 135542672.0, "step": 1267 }, { "epoch": 2.88939566704675, "grad_norm": 2.265625, "learning_rate": 4.261763708044633e-06, "loss": 0.5957, "mean_token_accuracy": 0.8799539804458618, "num_tokens": 135649649.0, "step": 1268 }, { "epoch": 2.8916761687571264, "grad_norm": 2.953125, "learning_rate": 4.2604269013427925e-06, "loss": 0.5871, "mean_token_accuracy": 0.8781967163085938, "num_tokens": 135756953.0, "step": 1269 }, { "epoch": 2.893956670467503, "grad_norm": 3.03125, "learning_rate": 4.25908909545663e-06, "loss": 0.6056, "mean_token_accuracy": 0.8765884786844254, "num_tokens": 135863953.0, "step": 1270 }, { "epoch": 2.8962371721778792, "grad_norm": 3.890625, "learning_rate": 4.257750291145457e-06, "loss": 0.611, "mean_token_accuracy": 0.8713682293891907, "num_tokens": 135970439.0, "step": 1271 }, { "epoch": 2.8985176738882554, "grad_norm": 2.125, "learning_rate": 4.256410489169154e-06, "loss": 0.5826, "mean_token_accuracy": 0.8807791769504547, "num_tokens": 136077645.0, "step": 1272 }, { "epoch": 2.9007981755986316, "grad_norm": 5.125, "learning_rate": 4.255069690288166e-06, "loss": 0.6053, "mean_token_accuracy": 0.8755863457918167, "num_tokens": 136184852.0, "step": 1273 }, { "epoch": 2.903078677309008, "grad_norm": 2.265625, "learning_rate": 4.253727895263504e-06, "loss": 0.5754, "mean_token_accuracy": 0.8805266171693802, "num_tokens": 136292297.0, "step": 1274 }, { "epoch": 2.905359179019384, "grad_norm": 2.4375, "learning_rate": 4.252385104856746e-06, "loss": 0.5819, "mean_token_accuracy": 0.8784385919570923, "num_tokens": 136399187.0, "step": 1275 }, { "epoch": 2.9076396807297606, "grad_norm": 2.78125, "learning_rate": 4.251041319830034e-06, "loss": 0.5675, "mean_token_accuracy": 0.8847327828407288, "num_tokens": 136506050.0, "step": 1276 }, { "epoch": 2.909920182440137, "grad_norm": 2.640625, "learning_rate": 4.249696540946074e-06, "loss": 0.5737, "mean_token_accuracy": 0.8823814243078232, "num_tokens": 136613948.0, "step": 1277 }, { "epoch": 2.912200684150513, "grad_norm": 2.40625, "learning_rate": 4.248350768968136e-06, "loss": 0.5926, "mean_token_accuracy": 0.8782215118408203, "num_tokens": 136720344.0, "step": 1278 }, { "epoch": 2.9144811858608897, "grad_norm": 2.6875, "learning_rate": 4.247004004660055e-06, "loss": 0.5972, "mean_token_accuracy": 0.8780831098556519, "num_tokens": 136827167.0, "step": 1279 }, { "epoch": 2.916761687571266, "grad_norm": 2.359375, "learning_rate": 4.245656248786228e-06, "loss": 0.573, "mean_token_accuracy": 0.880301833152771, "num_tokens": 136933946.0, "step": 1280 }, { "epoch": 2.919042189281642, "grad_norm": 2.9375, "learning_rate": 4.2443075021116166e-06, "loss": 0.5953, "mean_token_accuracy": 0.8767362087965012, "num_tokens": 137040549.0, "step": 1281 }, { "epoch": 2.9213226909920182, "grad_norm": 2.65625, "learning_rate": 4.242957765401741e-06, "loss": 0.5736, "mean_token_accuracy": 0.8834853321313858, "num_tokens": 137147698.0, "step": 1282 }, { "epoch": 2.9236031927023944, "grad_norm": 2.765625, "learning_rate": 4.241607039422687e-06, "loss": 0.5732, "mean_token_accuracy": 0.8839240074157715, "num_tokens": 137254610.0, "step": 1283 }, { "epoch": 2.9258836944127706, "grad_norm": 2.484375, "learning_rate": 4.2402553249411e-06, "loss": 0.5888, "mean_token_accuracy": 0.8824329972267151, "num_tokens": 137361723.0, "step": 1284 }, { "epoch": 2.928164196123147, "grad_norm": 3.234375, "learning_rate": 4.238902622724188e-06, "loss": 0.5786, "mean_token_accuracy": 0.8801659643650055, "num_tokens": 137469233.0, "step": 1285 }, { "epoch": 2.9304446978335235, "grad_norm": 3.171875, "learning_rate": 4.237548933539718e-06, "loss": 0.5609, "mean_token_accuracy": 0.8831808865070343, "num_tokens": 137575939.0, "step": 1286 }, { "epoch": 2.9327251995438997, "grad_norm": 2.65625, "learning_rate": 4.236194258156019e-06, "loss": 0.5652, "mean_token_accuracy": 0.8819428384304047, "num_tokens": 137683411.0, "step": 1287 }, { "epoch": 2.935005701254276, "grad_norm": 2.703125, "learning_rate": 4.234838597341977e-06, "loss": 0.5775, "mean_token_accuracy": 0.8816139101982117, "num_tokens": 137790631.0, "step": 1288 }, { "epoch": 2.9372862029646525, "grad_norm": 3.421875, "learning_rate": 4.233481951867039e-06, "loss": 0.6042, "mean_token_accuracy": 0.8764592856168747, "num_tokens": 137897306.0, "step": 1289 }, { "epoch": 2.9395667046750287, "grad_norm": 2.375, "learning_rate": 4.232124322501212e-06, "loss": 0.5876, "mean_token_accuracy": 0.8771606087684631, "num_tokens": 138004247.0, "step": 1290 }, { "epoch": 2.941847206385405, "grad_norm": 2.53125, "learning_rate": 4.230765710015058e-06, "loss": 0.5972, "mean_token_accuracy": 0.8792049288749695, "num_tokens": 138110738.0, "step": 1291 }, { "epoch": 2.944127708095781, "grad_norm": 2.359375, "learning_rate": 4.229406115179703e-06, "loss": 0.5811, "mean_token_accuracy": 0.8832272589206696, "num_tokens": 138218085.0, "step": 1292 }, { "epoch": 2.9464082098061573, "grad_norm": 4.34375, "learning_rate": 4.228045538766823e-06, "loss": 0.5946, "mean_token_accuracy": 0.8802038431167603, "num_tokens": 138324630.0, "step": 1293 }, { "epoch": 2.9486887115165334, "grad_norm": 3.734375, "learning_rate": 4.226683981548656e-06, "loss": 0.6021, "mean_token_accuracy": 0.8761596828699112, "num_tokens": 138431811.0, "step": 1294 }, { "epoch": 2.95096921322691, "grad_norm": 3.8125, "learning_rate": 4.2253214442979975e-06, "loss": 0.5868, "mean_token_accuracy": 0.8778193593025208, "num_tokens": 138539059.0, "step": 1295 }, { "epoch": 2.9532497149372863, "grad_norm": 6.34375, "learning_rate": 4.223957927788195e-06, "loss": 0.6276, "mean_token_accuracy": 0.8704663217067719, "num_tokens": 138646166.0, "step": 1296 }, { "epoch": 2.9555302166476625, "grad_norm": 7.0625, "learning_rate": 4.222593432793155e-06, "loss": 0.5865, "mean_token_accuracy": 0.879715234041214, "num_tokens": 138753154.0, "step": 1297 }, { "epoch": 2.9578107183580387, "grad_norm": 3.421875, "learning_rate": 4.2212279600873385e-06, "loss": 0.5672, "mean_token_accuracy": 0.8822585344314575, "num_tokens": 138860601.0, "step": 1298 }, { "epoch": 2.9600912200684153, "grad_norm": 6.03125, "learning_rate": 4.219861510445762e-06, "loss": 0.5785, "mean_token_accuracy": 0.8831232637166977, "num_tokens": 138967531.0, "step": 1299 }, { "epoch": 2.9623717217787915, "grad_norm": 3.78125, "learning_rate": 4.2184940846439946e-06, "loss": 0.6037, "mean_token_accuracy": 0.8782211989164352, "num_tokens": 139074913.0, "step": 1300 }, { "epoch": 2.9646522234891677, "grad_norm": 9.3125, "learning_rate": 4.217125683458162e-06, "loss": 0.5744, "mean_token_accuracy": 0.8801289498806, "num_tokens": 139182021.0, "step": 1301 }, { "epoch": 2.966932725199544, "grad_norm": 11.3125, "learning_rate": 4.215756307664941e-06, "loss": 0.5911, "mean_token_accuracy": 0.8780060112476349, "num_tokens": 139289235.0, "step": 1302 }, { "epoch": 2.96921322690992, "grad_norm": 13.75, "learning_rate": 4.214385958041565e-06, "loss": 0.6209, "mean_token_accuracy": 0.8715900182723999, "num_tokens": 139396474.0, "step": 1303 }, { "epoch": 2.9714937286202963, "grad_norm": 9.8125, "learning_rate": 4.213014635365816e-06, "loss": 0.614, "mean_token_accuracy": 0.8730522990226746, "num_tokens": 139502964.0, "step": 1304 }, { "epoch": 2.973774230330673, "grad_norm": 7.125, "learning_rate": 4.2116423404160316e-06, "loss": 0.5762, "mean_token_accuracy": 0.8835666477680206, "num_tokens": 139610473.0, "step": 1305 }, { "epoch": 2.976054732041049, "grad_norm": 2.71875, "learning_rate": 4.210269073971098e-06, "loss": 0.5923, "mean_token_accuracy": 0.8774100840091705, "num_tokens": 139718229.0, "step": 1306 }, { "epoch": 2.9783352337514253, "grad_norm": 4.0, "learning_rate": 4.208894836810457e-06, "loss": 0.5635, "mean_token_accuracy": 0.8828433156013489, "num_tokens": 139825370.0, "step": 1307 }, { "epoch": 2.9806157354618015, "grad_norm": 3.90625, "learning_rate": 4.207519629714099e-06, "loss": 0.5781, "mean_token_accuracy": 0.8787718713283539, "num_tokens": 139932155.0, "step": 1308 }, { "epoch": 2.982896237172178, "grad_norm": 6.28125, "learning_rate": 4.206143453462562e-06, "loss": 0.5967, "mean_token_accuracy": 0.8779617547988892, "num_tokens": 140039280.0, "step": 1309 }, { "epoch": 2.9851767388825543, "grad_norm": 2.78125, "learning_rate": 4.204766308836941e-06, "loss": 0.5802, "mean_token_accuracy": 0.8793751299381256, "num_tokens": 140146757.0, "step": 1310 }, { "epoch": 2.9874572405929305, "grad_norm": 2.71875, "learning_rate": 4.203388196618874e-06, "loss": 0.6091, "mean_token_accuracy": 0.8730627149343491, "num_tokens": 140253574.0, "step": 1311 }, { "epoch": 2.9897377423033067, "grad_norm": 5.25, "learning_rate": 4.202009117590552e-06, "loss": 0.5844, "mean_token_accuracy": 0.8779786825180054, "num_tokens": 140360312.0, "step": 1312 }, { "epoch": 2.992018244013683, "grad_norm": 5.1875, "learning_rate": 4.200629072534713e-06, "loss": 0.5836, "mean_token_accuracy": 0.8807083070278168, "num_tokens": 140467414.0, "step": 1313 }, { "epoch": 2.994298745724059, "grad_norm": 6.75, "learning_rate": 4.1992480622346455e-06, "loss": 0.5945, "mean_token_accuracy": 0.8760360926389694, "num_tokens": 140574002.0, "step": 1314 }, { "epoch": 2.9965792474344357, "grad_norm": 3.6875, "learning_rate": 4.197866087474181e-06, "loss": 0.5929, "mean_token_accuracy": 0.8772165775299072, "num_tokens": 140680798.0, "step": 1315 }, { "epoch": 2.998859749144812, "grad_norm": 2.9375, "learning_rate": 4.196483149037707e-06, "loss": 0.5989, "mean_token_accuracy": 0.8783533871173859, "num_tokens": 140787588.0, "step": 1316 }, { "epoch": 3.0, "grad_norm": 7.1875, "learning_rate": 4.195099247710147e-06, "loss": 0.624, "mean_token_accuracy": 0.8783180117607117, "num_tokens": 140826696.0, "step": 1317 }, { "epoch": 3.002280501710376, "grad_norm": 5.28125, "learning_rate": 4.1937143842769805e-06, "loss": 0.5899, "mean_token_accuracy": 0.8779798746109009, "num_tokens": 140933415.0, "step": 1318 }, { "epoch": 3.0045610034207524, "grad_norm": 3.359375, "learning_rate": 4.192328559524227e-06, "loss": 0.5726, "mean_token_accuracy": 0.8831545263528824, "num_tokens": 141041011.0, "step": 1319 }, { "epoch": 3.006841505131129, "grad_norm": 3.53125, "learning_rate": 4.190941774238454e-06, "loss": 0.577, "mean_token_accuracy": 0.8806227445602417, "num_tokens": 141148541.0, "step": 1320 }, { "epoch": 3.006841505131129, "eval_loss": 0.596246063709259, "eval_mean_token_accuracy": 0.8780989914339304, "eval_num_tokens": 141148541.0, "eval_runtime": 58.6547, "eval_samples_per_second": 142.955, "eval_steps_per_second": 4.484, "step": 1320 }, { "epoch": 3.009122006841505, "grad_norm": 2.484375, "learning_rate": 4.1895540292067765e-06, "loss": 0.5867, "mean_token_accuracy": 0.8786788433790207, "num_tokens": 141255428.0, "step": 1321 }, { "epoch": 3.0114025085518814, "grad_norm": 5.5625, "learning_rate": 4.18816532521685e-06, "loss": 0.5764, "mean_token_accuracy": 0.8818920701742172, "num_tokens": 141362931.0, "step": 1322 }, { "epoch": 3.0136830102622576, "grad_norm": 8.25, "learning_rate": 4.1867756630568755e-06, "loss": 0.5635, "mean_token_accuracy": 0.8816585689783096, "num_tokens": 141470783.0, "step": 1323 }, { "epoch": 3.015963511972634, "grad_norm": 5.65625, "learning_rate": 4.1853850435156e-06, "loss": 0.5817, "mean_token_accuracy": 0.8797392249107361, "num_tokens": 141578103.0, "step": 1324 }, { "epoch": 3.0182440136830104, "grad_norm": 4.125, "learning_rate": 4.18399346738231e-06, "loss": 0.5928, "mean_token_accuracy": 0.8776365965604782, "num_tokens": 141684405.0, "step": 1325 }, { "epoch": 3.0205245153933866, "grad_norm": 2.671875, "learning_rate": 4.18260093544684e-06, "loss": 0.588, "mean_token_accuracy": 0.8821319788694382, "num_tokens": 141790905.0, "step": 1326 }, { "epoch": 3.022805017103763, "grad_norm": 5.0625, "learning_rate": 4.181207448499562e-06, "loss": 0.5786, "mean_token_accuracy": 0.8799286335706711, "num_tokens": 141898196.0, "step": 1327 }, { "epoch": 3.025085518814139, "grad_norm": 2.953125, "learning_rate": 4.179813007331394e-06, "loss": 0.5785, "mean_token_accuracy": 0.8794847130775452, "num_tokens": 142005139.0, "step": 1328 }, { "epoch": 3.027366020524515, "grad_norm": 3.890625, "learning_rate": 4.178417612733792e-06, "loss": 0.578, "mean_token_accuracy": 0.8797455281019211, "num_tokens": 142111855.0, "step": 1329 }, { "epoch": 3.029646522234892, "grad_norm": 6.53125, "learning_rate": 4.177021265498757e-06, "loss": 0.5931, "mean_token_accuracy": 0.879165843129158, "num_tokens": 142218523.0, "step": 1330 }, { "epoch": 3.031927023945268, "grad_norm": 3.34375, "learning_rate": 4.1756239664188275e-06, "loss": 0.574, "mean_token_accuracy": 0.8838147222995758, "num_tokens": 142325909.0, "step": 1331 }, { "epoch": 3.034207525655644, "grad_norm": 2.21875, "learning_rate": 4.1742257162870835e-06, "loss": 0.5876, "mean_token_accuracy": 0.8792033344507217, "num_tokens": 142433088.0, "step": 1332 }, { "epoch": 3.0364880273660204, "grad_norm": 2.78125, "learning_rate": 4.172826515897146e-06, "loss": 0.6002, "mean_token_accuracy": 0.8756905943155289, "num_tokens": 142540035.0, "step": 1333 }, { "epoch": 3.0387685290763966, "grad_norm": 4.59375, "learning_rate": 4.171426366043172e-06, "loss": 0.5887, "mean_token_accuracy": 0.8791181296110153, "num_tokens": 142647392.0, "step": 1334 }, { "epoch": 3.0410490307867732, "grad_norm": 3.5625, "learning_rate": 4.170025267519862e-06, "loss": 0.5885, "mean_token_accuracy": 0.8790037631988525, "num_tokens": 142754270.0, "step": 1335 }, { "epoch": 3.0433295324971494, "grad_norm": 5.28125, "learning_rate": 4.168623221122451e-06, "loss": 0.5575, "mean_token_accuracy": 0.8844562470912933, "num_tokens": 142861663.0, "step": 1336 }, { "epoch": 3.0456100342075256, "grad_norm": 2.796875, "learning_rate": 4.167220227646713e-06, "loss": 0.5749, "mean_token_accuracy": 0.8789408653974533, "num_tokens": 142968943.0, "step": 1337 }, { "epoch": 3.047890535917902, "grad_norm": 4.0625, "learning_rate": 4.165816287888962e-06, "loss": 0.5843, "mean_token_accuracy": 0.879507377743721, "num_tokens": 143076504.0, "step": 1338 }, { "epoch": 3.050171037628278, "grad_norm": 2.5, "learning_rate": 4.164411402646045e-06, "loss": 0.5905, "mean_token_accuracy": 0.8781361430883408, "num_tokens": 143183423.0, "step": 1339 }, { "epoch": 3.0524515393386547, "grad_norm": 5.46875, "learning_rate": 4.163005572715348e-06, "loss": 0.5969, "mean_token_accuracy": 0.879531055688858, "num_tokens": 143290366.0, "step": 1340 }, { "epoch": 3.054732041049031, "grad_norm": 3.796875, "learning_rate": 4.161598798894795e-06, "loss": 0.595, "mean_token_accuracy": 0.8771509379148483, "num_tokens": 143397090.0, "step": 1341 }, { "epoch": 3.057012542759407, "grad_norm": 2.28125, "learning_rate": 4.160191081982841e-06, "loss": 0.6039, "mean_token_accuracy": 0.8763537108898163, "num_tokens": 143504171.0, "step": 1342 }, { "epoch": 3.0592930444697832, "grad_norm": 2.734375, "learning_rate": 4.15878242277848e-06, "loss": 0.5734, "mean_token_accuracy": 0.8811188638210297, "num_tokens": 143612068.0, "step": 1343 }, { "epoch": 3.0615735461801594, "grad_norm": 2.5625, "learning_rate": 4.157372822081241e-06, "loss": 0.5761, "mean_token_accuracy": 0.8822892308235168, "num_tokens": 143719411.0, "step": 1344 }, { "epoch": 3.063854047890536, "grad_norm": 3.09375, "learning_rate": 4.155962280691184e-06, "loss": 0.5873, "mean_token_accuracy": 0.8773934096097946, "num_tokens": 143826292.0, "step": 1345 }, { "epoch": 3.0661345496009123, "grad_norm": 6.125, "learning_rate": 4.154550799408906e-06, "loss": 0.5654, "mean_token_accuracy": 0.8825222551822662, "num_tokens": 143933142.0, "step": 1346 }, { "epoch": 3.0684150513112884, "grad_norm": 4.125, "learning_rate": 4.153138379035537e-06, "loss": 0.5774, "mean_token_accuracy": 0.8797077685594559, "num_tokens": 144040202.0, "step": 1347 }, { "epoch": 3.0706955530216646, "grad_norm": 4.25, "learning_rate": 4.1517250203727395e-06, "loss": 0.5917, "mean_token_accuracy": 0.8776738494634628, "num_tokens": 144147431.0, "step": 1348 }, { "epoch": 3.072976054732041, "grad_norm": 3.734375, "learning_rate": 4.150310724222708e-06, "loss": 0.5766, "mean_token_accuracy": 0.8820817023515701, "num_tokens": 144254552.0, "step": 1349 }, { "epoch": 3.0752565564424175, "grad_norm": 3.5625, "learning_rate": 4.14889549138817e-06, "loss": 0.5758, "mean_token_accuracy": 0.8843452036380768, "num_tokens": 144362233.0, "step": 1350 }, { "epoch": 3.0775370581527937, "grad_norm": 2.65625, "learning_rate": 4.147479322672383e-06, "loss": 0.5932, "mean_token_accuracy": 0.8773269057273865, "num_tokens": 144469328.0, "step": 1351 }, { "epoch": 3.07981755986317, "grad_norm": 2.484375, "learning_rate": 4.14606221887914e-06, "loss": 0.5724, "mean_token_accuracy": 0.882498249411583, "num_tokens": 144576504.0, "step": 1352 }, { "epoch": 3.082098061573546, "grad_norm": 4.0625, "learning_rate": 4.144644180812759e-06, "loss": 0.5873, "mean_token_accuracy": 0.8806255161762238, "num_tokens": 144683928.0, "step": 1353 }, { "epoch": 3.0843785632839227, "grad_norm": 2.875, "learning_rate": 4.143225209278093e-06, "loss": 0.6075, "mean_token_accuracy": 0.8764054775238037, "num_tokens": 144791032.0, "step": 1354 }, { "epoch": 3.086659064994299, "grad_norm": 3.5625, "learning_rate": 4.141805305080521e-06, "loss": 0.6094, "mean_token_accuracy": 0.8736419975757599, "num_tokens": 144897700.0, "step": 1355 }, { "epoch": 3.088939566704675, "grad_norm": 3.703125, "learning_rate": 4.1403844690259544e-06, "loss": 0.6007, "mean_token_accuracy": 0.8770473003387451, "num_tokens": 145004603.0, "step": 1356 }, { "epoch": 3.0912200684150513, "grad_norm": 2.03125, "learning_rate": 4.138962701920831e-06, "loss": 0.5742, "mean_token_accuracy": 0.8825189918279648, "num_tokens": 145112050.0, "step": 1357 }, { "epoch": 3.0935005701254275, "grad_norm": 2.609375, "learning_rate": 4.13754000457212e-06, "loss": 0.5937, "mean_token_accuracy": 0.8764393627643585, "num_tokens": 145218872.0, "step": 1358 }, { "epoch": 3.095781071835804, "grad_norm": 2.8125, "learning_rate": 4.136116377787317e-06, "loss": 0.6087, "mean_token_accuracy": 0.8764399290084839, "num_tokens": 145325751.0, "step": 1359 }, { "epoch": 3.0980615735461803, "grad_norm": 2.5, "learning_rate": 4.134691822374445e-06, "loss": 0.5701, "mean_token_accuracy": 0.8835193067789078, "num_tokens": 145432654.0, "step": 1360 }, { "epoch": 3.1003420752565565, "grad_norm": 3.0, "learning_rate": 4.1332663391420515e-06, "loss": 0.5884, "mean_token_accuracy": 0.8790077865123749, "num_tokens": 145539487.0, "step": 1361 }, { "epoch": 3.1026225769669327, "grad_norm": 2.328125, "learning_rate": 4.131839928899217e-06, "loss": 0.58, "mean_token_accuracy": 0.880421444773674, "num_tokens": 145647251.0, "step": 1362 }, { "epoch": 3.104903078677309, "grad_norm": 2.4375, "learning_rate": 4.130412592455542e-06, "loss": 0.5916, "mean_token_accuracy": 0.8812432438135147, "num_tokens": 145754411.0, "step": 1363 }, { "epoch": 3.1071835803876855, "grad_norm": 3.15625, "learning_rate": 4.128984330621157e-06, "loss": 0.5819, "mean_token_accuracy": 0.8790383189916611, "num_tokens": 145861509.0, "step": 1364 }, { "epoch": 3.1094640820980617, "grad_norm": 3.453125, "learning_rate": 4.127555144206713e-06, "loss": 0.5979, "mean_token_accuracy": 0.8779343664646149, "num_tokens": 145968759.0, "step": 1365 }, { "epoch": 3.111744583808438, "grad_norm": 2.46875, "learning_rate": 4.126125034023392e-06, "loss": 0.5988, "mean_token_accuracy": 0.8722660690546036, "num_tokens": 146075699.0, "step": 1366 }, { "epoch": 3.114025085518814, "grad_norm": 4.03125, "learning_rate": 4.124694000882894e-06, "loss": 0.6043, "mean_token_accuracy": 0.8754329532384872, "num_tokens": 146183526.0, "step": 1367 }, { "epoch": 3.1163055872291903, "grad_norm": 2.359375, "learning_rate": 4.123262045597447e-06, "loss": 0.601, "mean_token_accuracy": 0.8771016895771027, "num_tokens": 146290729.0, "step": 1368 }, { "epoch": 3.118586088939567, "grad_norm": 3.71875, "learning_rate": 4.121829168979802e-06, "loss": 0.5863, "mean_token_accuracy": 0.8831167221069336, "num_tokens": 146397777.0, "step": 1369 }, { "epoch": 3.120866590649943, "grad_norm": 6.4375, "learning_rate": 4.120395371843231e-06, "loss": 0.5964, "mean_token_accuracy": 0.8783126771450043, "num_tokens": 146504599.0, "step": 1370 }, { "epoch": 3.1231470923603193, "grad_norm": 3.578125, "learning_rate": 4.11896065500153e-06, "loss": 0.5848, "mean_token_accuracy": 0.8771827965974808, "num_tokens": 146611271.0, "step": 1371 }, { "epoch": 3.1254275940706955, "grad_norm": 3.171875, "learning_rate": 4.117525019269016e-06, "loss": 0.5836, "mean_token_accuracy": 0.8794733434915543, "num_tokens": 146718261.0, "step": 1372 }, { "epoch": 3.1277080957810717, "grad_norm": 2.65625, "learning_rate": 4.116088465460529e-06, "loss": 0.5802, "mean_token_accuracy": 0.8812500983476639, "num_tokens": 146825123.0, "step": 1373 }, { "epoch": 3.1299885974914483, "grad_norm": 3.25, "learning_rate": 4.114650994391428e-06, "loss": 0.5864, "mean_token_accuracy": 0.8858306407928467, "num_tokens": 146932562.0, "step": 1374 }, { "epoch": 3.1322690992018245, "grad_norm": 4.84375, "learning_rate": 4.113212606877596e-06, "loss": 0.5975, "mean_token_accuracy": 0.8762167394161224, "num_tokens": 147039235.0, "step": 1375 }, { "epoch": 3.1345496009122007, "grad_norm": 2.5625, "learning_rate": 4.111773303735432e-06, "loss": 0.6084, "mean_token_accuracy": 0.8757900893688202, "num_tokens": 147146449.0, "step": 1376 }, { "epoch": 3.136830102622577, "grad_norm": 3.65625, "learning_rate": 4.110333085781857e-06, "loss": 0.5972, "mean_token_accuracy": 0.8781153112649918, "num_tokens": 147253224.0, "step": 1377 }, { "epoch": 3.139110604332953, "grad_norm": 2.921875, "learning_rate": 4.108891953834312e-06, "loss": 0.6022, "mean_token_accuracy": 0.8766528069972992, "num_tokens": 147359809.0, "step": 1378 }, { "epoch": 3.1413911060433297, "grad_norm": 3.671875, "learning_rate": 4.107449908710753e-06, "loss": 0.5898, "mean_token_accuracy": 0.8773281574249268, "num_tokens": 147467375.0, "step": 1379 }, { "epoch": 3.143671607753706, "grad_norm": 4.40625, "learning_rate": 4.106006951229661e-06, "loss": 0.5814, "mean_token_accuracy": 0.8796637058258057, "num_tokens": 147574319.0, "step": 1380 }, { "epoch": 3.145952109464082, "grad_norm": 3.8125, "learning_rate": 4.104563082210028e-06, "loss": 0.5763, "mean_token_accuracy": 0.8800371438264847, "num_tokens": 147681603.0, "step": 1381 }, { "epoch": 3.1482326111744583, "grad_norm": 3.734375, "learning_rate": 4.1031183024713665e-06, "loss": 0.5898, "mean_token_accuracy": 0.8773266673088074, "num_tokens": 147788610.0, "step": 1382 }, { "epoch": 3.1505131128848345, "grad_norm": 4.5, "learning_rate": 4.101672612833706e-06, "loss": 0.5791, "mean_token_accuracy": 0.8835958689451218, "num_tokens": 147895321.0, "step": 1383 }, { "epoch": 3.152793614595211, "grad_norm": 4.75, "learning_rate": 4.100226014117592e-06, "loss": 0.6047, "mean_token_accuracy": 0.8747086226940155, "num_tokens": 148002099.0, "step": 1384 }, { "epoch": 3.1550741163055873, "grad_norm": 4.0, "learning_rate": 4.098778507144086e-06, "loss": 0.5649, "mean_token_accuracy": 0.883410856127739, "num_tokens": 148109199.0, "step": 1385 }, { "epoch": 3.1573546180159635, "grad_norm": 3.6875, "learning_rate": 4.097330092734765e-06, "loss": 0.5878, "mean_token_accuracy": 0.8802912831306458, "num_tokens": 148215632.0, "step": 1386 }, { "epoch": 3.1596351197263397, "grad_norm": 6.375, "learning_rate": 4.09588077171172e-06, "loss": 0.5637, "mean_token_accuracy": 0.8851134181022644, "num_tokens": 148323133.0, "step": 1387 }, { "epoch": 3.161915621436716, "grad_norm": 6.21875, "learning_rate": 4.094430544897559e-06, "loss": 0.5913, "mean_token_accuracy": 0.8810424953699112, "num_tokens": 148429831.0, "step": 1388 }, { "epoch": 3.1641961231470925, "grad_norm": 6.0625, "learning_rate": 4.092979413115404e-06, "loss": 0.586, "mean_token_accuracy": 0.8824973404407501, "num_tokens": 148536848.0, "step": 1389 }, { "epoch": 3.1664766248574687, "grad_norm": 4.21875, "learning_rate": 4.091527377188886e-06, "loss": 0.5861, "mean_token_accuracy": 0.8795595914125443, "num_tokens": 148644319.0, "step": 1390 }, { "epoch": 3.168757126567845, "grad_norm": 3.640625, "learning_rate": 4.090074437942155e-06, "loss": 0.5768, "mean_token_accuracy": 0.879493236541748, "num_tokens": 148751953.0, "step": 1391 }, { "epoch": 3.171037628278221, "grad_norm": 5.0625, "learning_rate": 4.088620596199872e-06, "loss": 0.5868, "mean_token_accuracy": 0.8819226622581482, "num_tokens": 148859244.0, "step": 1392 }, { "epoch": 3.1733181299885973, "grad_norm": 5.15625, "learning_rate": 4.087165852787206e-06, "loss": 0.5872, "mean_token_accuracy": 0.8783839493989944, "num_tokens": 148966518.0, "step": 1393 }, { "epoch": 3.175598631698974, "grad_norm": 6.09375, "learning_rate": 4.085710208529844e-06, "loss": 0.5879, "mean_token_accuracy": 0.8783328980207443, "num_tokens": 149074306.0, "step": 1394 }, { "epoch": 3.17787913340935, "grad_norm": 2.671875, "learning_rate": 4.084253664253981e-06, "loss": 0.5838, "mean_token_accuracy": 0.8816855847835541, "num_tokens": 149181408.0, "step": 1395 }, { "epoch": 3.1801596351197263, "grad_norm": 4.6875, "learning_rate": 4.082796220786324e-06, "loss": 0.5968, "mean_token_accuracy": 0.8770945519208908, "num_tokens": 149288874.0, "step": 1396 }, { "epoch": 3.1824401368301025, "grad_norm": 4.09375, "learning_rate": 4.081337878954088e-06, "loss": 0.5781, "mean_token_accuracy": 0.8788999617099762, "num_tokens": 149395972.0, "step": 1397 }, { "epoch": 3.1847206385404787, "grad_norm": 2.25, "learning_rate": 4.079878639585002e-06, "loss": 0.563, "mean_token_accuracy": 0.884684830904007, "num_tokens": 149503225.0, "step": 1398 }, { "epoch": 3.1870011402508553, "grad_norm": 3.234375, "learning_rate": 4.0784185035072996e-06, "loss": 0.593, "mean_token_accuracy": 0.8796551078557968, "num_tokens": 149609665.0, "step": 1399 }, { "epoch": 3.1892816419612315, "grad_norm": 2.59375, "learning_rate": 4.076957471549728e-06, "loss": 0.5844, "mean_token_accuracy": 0.8772178441286087, "num_tokens": 149716324.0, "step": 1400 }, { "epoch": 3.1915621436716077, "grad_norm": 3.4375, "learning_rate": 4.0754955445415405e-06, "loss": 0.5978, "mean_token_accuracy": 0.8765067309141159, "num_tokens": 149823176.0, "step": 1401 }, { "epoch": 3.193842645381984, "grad_norm": 3.296875, "learning_rate": 4.074032723312497e-06, "loss": 0.5904, "mean_token_accuracy": 0.8809285014867783, "num_tokens": 149930057.0, "step": 1402 }, { "epoch": 3.19612314709236, "grad_norm": 3.484375, "learning_rate": 4.072569008692868e-06, "loss": 0.5934, "mean_token_accuracy": 0.8809255510568619, "num_tokens": 150036837.0, "step": 1403 }, { "epoch": 3.1984036488027368, "grad_norm": 2.53125, "learning_rate": 4.071104401513429e-06, "loss": 0.58, "mean_token_accuracy": 0.8793366551399231, "num_tokens": 150144003.0, "step": 1404 }, { "epoch": 3.200684150513113, "grad_norm": 3.0625, "learning_rate": 4.069638902605464e-06, "loss": 0.5849, "mean_token_accuracy": 0.8811514675617218, "num_tokens": 150252100.0, "step": 1405 }, { "epoch": 3.202964652223489, "grad_norm": 2.953125, "learning_rate": 4.06817251280076e-06, "loss": 0.6003, "mean_token_accuracy": 0.8807378858327866, "num_tokens": 150359077.0, "step": 1406 }, { "epoch": 3.2052451539338653, "grad_norm": 2.78125, "learning_rate": 4.0667052329316125e-06, "loss": 0.5956, "mean_token_accuracy": 0.8794204890727997, "num_tokens": 150466473.0, "step": 1407 }, { "epoch": 3.2075256556442415, "grad_norm": 2.46875, "learning_rate": 4.0652370638308215e-06, "loss": 0.603, "mean_token_accuracy": 0.8787625581026077, "num_tokens": 150572929.0, "step": 1408 }, { "epoch": 3.209806157354618, "grad_norm": 3.6875, "learning_rate": 4.063768006331691e-06, "loss": 0.598, "mean_token_accuracy": 0.874964028596878, "num_tokens": 150679963.0, "step": 1409 }, { "epoch": 3.2120866590649944, "grad_norm": 6.4375, "learning_rate": 4.06229806126803e-06, "loss": 0.5632, "mean_token_accuracy": 0.882831260561943, "num_tokens": 150787240.0, "step": 1410 }, { "epoch": 3.2143671607753705, "grad_norm": 2.921875, "learning_rate": 4.06082722947415e-06, "loss": 0.5531, "mean_token_accuracy": 0.8842821419239044, "num_tokens": 150894724.0, "step": 1411 }, { "epoch": 3.2166476624857467, "grad_norm": 6.09375, "learning_rate": 4.059355511784868e-06, "loss": 0.5953, "mean_token_accuracy": 0.8783928453922272, "num_tokens": 151001455.0, "step": 1412 }, { "epoch": 3.2189281641961234, "grad_norm": 5.0625, "learning_rate": 4.057882909035503e-06, "loss": 0.5872, "mean_token_accuracy": 0.8786003589630127, "num_tokens": 151108702.0, "step": 1413 }, { "epoch": 3.2212086659064996, "grad_norm": 4.75, "learning_rate": 4.0564094220618735e-06, "loss": 0.5748, "mean_token_accuracy": 0.8806807845830917, "num_tokens": 151215645.0, "step": 1414 }, { "epoch": 3.2234891676168758, "grad_norm": 3.53125, "learning_rate": 4.054935051700305e-06, "loss": 0.5728, "mean_token_accuracy": 0.8828134685754776, "num_tokens": 151322626.0, "step": 1415 }, { "epoch": 3.225769669327252, "grad_norm": 2.546875, "learning_rate": 4.053459798787619e-06, "loss": 0.5988, "mean_token_accuracy": 0.8766085356473923, "num_tokens": 151429877.0, "step": 1416 }, { "epoch": 3.228050171037628, "grad_norm": 4.34375, "learning_rate": 4.0519836641611425e-06, "loss": 0.5851, "mean_token_accuracy": 0.8795545697212219, "num_tokens": 151536994.0, "step": 1417 }, { "epoch": 3.2303306727480043, "grad_norm": 5.6875, "learning_rate": 4.050506648658701e-06, "loss": 0.5921, "mean_token_accuracy": 0.8756987452507019, "num_tokens": 151643935.0, "step": 1418 }, { "epoch": 3.232611174458381, "grad_norm": 4.03125, "learning_rate": 4.049028753118619e-06, "loss": 0.5959, "mean_token_accuracy": 0.8784631341695786, "num_tokens": 151751200.0, "step": 1419 }, { "epoch": 3.234891676168757, "grad_norm": 3.84375, "learning_rate": 4.047549978379721e-06, "loss": 0.6032, "mean_token_accuracy": 0.8733428716659546, "num_tokens": 151858059.0, "step": 1420 }, { "epoch": 3.2371721778791334, "grad_norm": 2.421875, "learning_rate": 4.046070325281333e-06, "loss": 0.5731, "mean_token_accuracy": 0.8788390904664993, "num_tokens": 151965037.0, "step": 1421 }, { "epoch": 3.2394526795895096, "grad_norm": 4.6875, "learning_rate": 4.044589794663275e-06, "loss": 0.6012, "mean_token_accuracy": 0.8751859962940216, "num_tokens": 152072452.0, "step": 1422 }, { "epoch": 3.241733181299886, "grad_norm": 3.140625, "learning_rate": 4.04310838736587e-06, "loss": 0.5783, "mean_token_accuracy": 0.8823445439338684, "num_tokens": 152180256.0, "step": 1423 }, { "epoch": 3.2440136830102624, "grad_norm": 5.3125, "learning_rate": 4.041626104229937e-06, "loss": 0.5671, "mean_token_accuracy": 0.8832966238260269, "num_tokens": 152287148.0, "step": 1424 }, { "epoch": 3.2462941847206386, "grad_norm": 2.4375, "learning_rate": 4.0401429460967864e-06, "loss": 0.5864, "mean_token_accuracy": 0.8766773343086243, "num_tokens": 152393325.0, "step": 1425 }, { "epoch": 3.2485746864310148, "grad_norm": 2.609375, "learning_rate": 4.038658913808235e-06, "loss": 0.595, "mean_token_accuracy": 0.877457782626152, "num_tokens": 152500677.0, "step": 1426 }, { "epoch": 3.250855188141391, "grad_norm": 3.671875, "learning_rate": 4.037174008206589e-06, "loss": 0.5955, "mean_token_accuracy": 0.8779526948928833, "num_tokens": 152606926.0, "step": 1427 }, { "epoch": 3.253135689851767, "grad_norm": 2.875, "learning_rate": 4.035688230134651e-06, "loss": 0.5891, "mean_token_accuracy": 0.8811050057411194, "num_tokens": 152713975.0, "step": 1428 }, { "epoch": 3.255416191562144, "grad_norm": 3.046875, "learning_rate": 4.034201580435723e-06, "loss": 0.5865, "mean_token_accuracy": 0.8812618553638458, "num_tokens": 152820865.0, "step": 1429 }, { "epoch": 3.25769669327252, "grad_norm": 3.0, "learning_rate": 4.0327140599535954e-06, "loss": 0.6084, "mean_token_accuracy": 0.8711613863706589, "num_tokens": 152927618.0, "step": 1430 }, { "epoch": 3.259977194982896, "grad_norm": 2.296875, "learning_rate": 4.031225669532558e-06, "loss": 0.5991, "mean_token_accuracy": 0.8751797676086426, "num_tokens": 153034150.0, "step": 1431 }, { "epoch": 3.2622576966932724, "grad_norm": 2.546875, "learning_rate": 4.029736410017392e-06, "loss": 0.5783, "mean_token_accuracy": 0.8789128512144089, "num_tokens": 153141189.0, "step": 1432 }, { "epoch": 3.264538198403649, "grad_norm": 4.5, "learning_rate": 4.028246282253373e-06, "loss": 0.5939, "mean_token_accuracy": 0.8788965493440628, "num_tokens": 153247856.0, "step": 1433 }, { "epoch": 3.266818700114025, "grad_norm": 2.828125, "learning_rate": 4.026755287086267e-06, "loss": 0.5809, "mean_token_accuracy": 0.8801927268505096, "num_tokens": 153355189.0, "step": 1434 }, { "epoch": 3.2690992018244014, "grad_norm": 6.34375, "learning_rate": 4.025263425362335e-06, "loss": 0.5797, "mean_token_accuracy": 0.8794397115707397, "num_tokens": 153461890.0, "step": 1435 }, { "epoch": 3.2713797035347776, "grad_norm": 3.953125, "learning_rate": 4.0237706979283306e-06, "loss": 0.5722, "mean_token_accuracy": 0.8837473094463348, "num_tokens": 153568765.0, "step": 1436 }, { "epoch": 3.2736602052451538, "grad_norm": 3.390625, "learning_rate": 4.022277105631495e-06, "loss": 0.5853, "mean_token_accuracy": 0.882627084851265, "num_tokens": 153675854.0, "step": 1437 }, { "epoch": 3.27594070695553, "grad_norm": 2.515625, "learning_rate": 4.020782649319563e-06, "loss": 0.5791, "mean_token_accuracy": 0.8796237707138062, "num_tokens": 153783094.0, "step": 1438 }, { "epoch": 3.2782212086659066, "grad_norm": 3.3125, "learning_rate": 4.019287329840759e-06, "loss": 0.5933, "mean_token_accuracy": 0.8813715130090714, "num_tokens": 153889764.0, "step": 1439 }, { "epoch": 3.280501710376283, "grad_norm": 2.8125, "learning_rate": 4.017791148043797e-06, "loss": 0.5841, "mean_token_accuracy": 0.882407933473587, "num_tokens": 153996810.0, "step": 1440 }, { "epoch": 3.282782212086659, "grad_norm": 2.578125, "learning_rate": 4.016294104777883e-06, "loss": 0.5801, "mean_token_accuracy": 0.8812000006437302, "num_tokens": 154103786.0, "step": 1441 }, { "epoch": 3.285062713797035, "grad_norm": 2.265625, "learning_rate": 4.0147962008927065e-06, "loss": 0.5734, "mean_token_accuracy": 0.8824433386325836, "num_tokens": 154211857.0, "step": 1442 }, { "epoch": 3.287343215507412, "grad_norm": 3.34375, "learning_rate": 4.013297437238452e-06, "loss": 0.5647, "mean_token_accuracy": 0.8849389404058456, "num_tokens": 154318483.0, "step": 1443 }, { "epoch": 3.289623717217788, "grad_norm": 3.609375, "learning_rate": 4.011797814665787e-06, "loss": 0.5825, "mean_token_accuracy": 0.8829954713582993, "num_tokens": 154425544.0, "step": 1444 }, { "epoch": 3.291904218928164, "grad_norm": 2.375, "learning_rate": 4.010297334025869e-06, "loss": 0.5487, "mean_token_accuracy": 0.8852455317974091, "num_tokens": 154532774.0, "step": 1445 }, { "epoch": 3.2941847206385404, "grad_norm": 2.4375, "learning_rate": 4.008795996170341e-06, "loss": 0.5726, "mean_token_accuracy": 0.8847774416208267, "num_tokens": 154639810.0, "step": 1446 }, { "epoch": 3.2964652223489166, "grad_norm": 2.46875, "learning_rate": 4.0072938019513345e-06, "loss": 0.5896, "mean_token_accuracy": 0.8774778991937637, "num_tokens": 154747419.0, "step": 1447 }, { "epoch": 3.2987457240592932, "grad_norm": 3.328125, "learning_rate": 4.0057907522214646e-06, "loss": 0.5816, "mean_token_accuracy": 0.8795448988676071, "num_tokens": 154855354.0, "step": 1448 }, { "epoch": 3.3010262257696694, "grad_norm": 3.140625, "learning_rate": 4.004286847833835e-06, "loss": 0.5805, "mean_token_accuracy": 0.8828533291816711, "num_tokens": 154962700.0, "step": 1449 }, { "epoch": 3.3033067274800456, "grad_norm": 4.0625, "learning_rate": 4.002782089642031e-06, "loss": 0.5988, "mean_token_accuracy": 0.8755003958940506, "num_tokens": 155069571.0, "step": 1450 }, { "epoch": 3.305587229190422, "grad_norm": 2.375, "learning_rate": 4.001276478500127e-06, "loss": 0.5454, "mean_token_accuracy": 0.8879126906394958, "num_tokens": 155177593.0, "step": 1451 }, { "epoch": 3.307867730900798, "grad_norm": 2.671875, "learning_rate": 3.9997700152626755e-06, "loss": 0.5674, "mean_token_accuracy": 0.8830482810735703, "num_tokens": 155284329.0, "step": 1452 }, { "epoch": 3.3101482326111746, "grad_norm": 8.375, "learning_rate": 3.9982627007847186e-06, "loss": 0.5848, "mean_token_accuracy": 0.8783923089504242, "num_tokens": 155392445.0, "step": 1453 }, { "epoch": 3.312428734321551, "grad_norm": 4.5625, "learning_rate": 3.996754535921777e-06, "loss": 0.5655, "mean_token_accuracy": 0.8857136368751526, "num_tokens": 155499630.0, "step": 1454 }, { "epoch": 3.314709236031927, "grad_norm": 3.1875, "learning_rate": 3.995245521529857e-06, "loss": 0.5649, "mean_token_accuracy": 0.8815815448760986, "num_tokens": 155606583.0, "step": 1455 }, { "epoch": 3.316989737742303, "grad_norm": 2.78125, "learning_rate": 3.993735658465446e-06, "loss": 0.5964, "mean_token_accuracy": 0.880055382847786, "num_tokens": 155712881.0, "step": 1456 }, { "epoch": 3.3192702394526794, "grad_norm": 4.375, "learning_rate": 3.992224947585513e-06, "loss": 0.6071, "mean_token_accuracy": 0.8751248270273209, "num_tokens": 155820166.0, "step": 1457 }, { "epoch": 3.321550741163056, "grad_norm": 8.3125, "learning_rate": 3.990713389747508e-06, "loss": 0.5909, "mean_token_accuracy": 0.8792251795530319, "num_tokens": 155927043.0, "step": 1458 }, { "epoch": 3.3238312428734322, "grad_norm": 6.21875, "learning_rate": 3.989200985809362e-06, "loss": 0.5721, "mean_token_accuracy": 0.8834533840417862, "num_tokens": 156034044.0, "step": 1459 }, { "epoch": 3.3261117445838084, "grad_norm": 3.71875, "learning_rate": 3.987687736629487e-06, "loss": 0.5925, "mean_token_accuracy": 0.8784044981002808, "num_tokens": 156140733.0, "step": 1460 }, { "epoch": 3.3283922462941846, "grad_norm": 3.71875, "learning_rate": 3.986173643066774e-06, "loss": 0.5919, "mean_token_accuracy": 0.8763966262340546, "num_tokens": 156247552.0, "step": 1461 }, { "epoch": 3.330672748004561, "grad_norm": 2.9375, "learning_rate": 3.984658705980593e-06, "loss": 0.6053, "mean_token_accuracy": 0.8726420104503632, "num_tokens": 156355329.0, "step": 1462 }, { "epoch": 3.3329532497149374, "grad_norm": 5.34375, "learning_rate": 3.983142926230792e-06, "loss": 0.5804, "mean_token_accuracy": 0.880921483039856, "num_tokens": 156462676.0, "step": 1463 }, { "epoch": 3.3352337514253136, "grad_norm": 3.921875, "learning_rate": 3.981626304677701e-06, "loss": 0.5803, "mean_token_accuracy": 0.8802833706140518, "num_tokens": 156570120.0, "step": 1464 }, { "epoch": 3.33751425313569, "grad_norm": 3.796875, "learning_rate": 3.980108842182121e-06, "loss": 0.605, "mean_token_accuracy": 0.8745223730802536, "num_tokens": 156677584.0, "step": 1465 }, { "epoch": 3.339794754846066, "grad_norm": 3.453125, "learning_rate": 3.978590539605338e-06, "loss": 0.5827, "mean_token_accuracy": 0.8785550892353058, "num_tokens": 156785538.0, "step": 1466 }, { "epoch": 3.342075256556442, "grad_norm": 3.109375, "learning_rate": 3.97707139780911e-06, "loss": 0.5614, "mean_token_accuracy": 0.8855056762695312, "num_tokens": 156893407.0, "step": 1467 }, { "epoch": 3.344355758266819, "grad_norm": 2.546875, "learning_rate": 3.975551417655673e-06, "loss": 0.5701, "mean_token_accuracy": 0.8805434554815292, "num_tokens": 157000135.0, "step": 1468 }, { "epoch": 3.346636259977195, "grad_norm": 2.703125, "learning_rate": 3.974030600007737e-06, "loss": 0.5824, "mean_token_accuracy": 0.880170151591301, "num_tokens": 157106985.0, "step": 1469 }, { "epoch": 3.3489167616875712, "grad_norm": 2.421875, "learning_rate": 3.97250894572849e-06, "loss": 0.5935, "mean_token_accuracy": 0.8791963756084442, "num_tokens": 157213948.0, "step": 1470 }, { "epoch": 3.3511972633979474, "grad_norm": 5.0625, "learning_rate": 3.970986455681593e-06, "loss": 0.5577, "mean_token_accuracy": 0.8822743445634842, "num_tokens": 157321436.0, "step": 1471 }, { "epoch": 3.353477765108324, "grad_norm": 2.46875, "learning_rate": 3.969463130731183e-06, "loss": 0.6327, "mean_token_accuracy": 0.8710167407989502, "num_tokens": 157427856.0, "step": 1472 }, { "epoch": 3.3557582668187003, "grad_norm": 2.78125, "learning_rate": 3.967938971741869e-06, "loss": 0.597, "mean_token_accuracy": 0.880034327507019, "num_tokens": 157535511.0, "step": 1473 }, { "epoch": 3.3580387685290765, "grad_norm": 4.65625, "learning_rate": 3.966413979578734e-06, "loss": 0.5949, "mean_token_accuracy": 0.8794209510087967, "num_tokens": 157642677.0, "step": 1474 }, { "epoch": 3.3603192702394526, "grad_norm": 2.59375, "learning_rate": 3.964888155107335e-06, "loss": 0.581, "mean_token_accuracy": 0.8807871788740158, "num_tokens": 157749664.0, "step": 1475 }, { "epoch": 3.362599771949829, "grad_norm": 3.578125, "learning_rate": 3.963361499193699e-06, "loss": 0.5919, "mean_token_accuracy": 0.8778406232595444, "num_tokens": 157857160.0, "step": 1476 }, { "epoch": 3.364880273660205, "grad_norm": 2.359375, "learning_rate": 3.9618340127043274e-06, "loss": 0.5798, "mean_token_accuracy": 0.881255105137825, "num_tokens": 157964040.0, "step": 1477 }, { "epoch": 3.3671607753705817, "grad_norm": 3.515625, "learning_rate": 3.960305696506192e-06, "loss": 0.5802, "mean_token_accuracy": 0.8831611126661301, "num_tokens": 158071025.0, "step": 1478 }, { "epoch": 3.369441277080958, "grad_norm": 2.84375, "learning_rate": 3.958776551466737e-06, "loss": 0.5853, "mean_token_accuracy": 0.8826898485422134, "num_tokens": 158177813.0, "step": 1479 }, { "epoch": 3.371721778791334, "grad_norm": 2.53125, "learning_rate": 3.957246578453873e-06, "loss": 0.6079, "mean_token_accuracy": 0.8766767531633377, "num_tokens": 158284056.0, "step": 1480 }, { "epoch": 3.3740022805017102, "grad_norm": 4.65625, "learning_rate": 3.955715778335984e-06, "loss": 0.5648, "mean_token_accuracy": 0.882177472114563, "num_tokens": 158390740.0, "step": 1481 }, { "epoch": 3.376282782212087, "grad_norm": 2.859375, "learning_rate": 3.954184151981924e-06, "loss": 0.5948, "mean_token_accuracy": 0.8798738569021225, "num_tokens": 158497577.0, "step": 1482 }, { "epoch": 3.378563283922463, "grad_norm": 3.203125, "learning_rate": 3.952651700261012e-06, "loss": 0.5969, "mean_token_accuracy": 0.8804901242256165, "num_tokens": 158604558.0, "step": 1483 }, { "epoch": 3.3808437856328393, "grad_norm": 3.640625, "learning_rate": 3.95111842404304e-06, "loss": 0.5684, "mean_token_accuracy": 0.8816994279623032, "num_tokens": 158712603.0, "step": 1484 }, { "epoch": 3.3831242873432155, "grad_norm": 2.59375, "learning_rate": 3.949584324198266e-06, "loss": 0.5931, "mean_token_accuracy": 0.8765831738710403, "num_tokens": 158819917.0, "step": 1485 }, { "epoch": 3.3854047890535917, "grad_norm": 2.828125, "learning_rate": 3.948049401597414e-06, "loss": 0.576, "mean_token_accuracy": 0.881806880235672, "num_tokens": 158927113.0, "step": 1486 }, { "epoch": 3.387685290763968, "grad_norm": 3.578125, "learning_rate": 3.946513657111678e-06, "loss": 0.5814, "mean_token_accuracy": 0.881575807929039, "num_tokens": 159034584.0, "step": 1487 }, { "epoch": 3.3899657924743445, "grad_norm": 2.40625, "learning_rate": 3.944977091612716e-06, "loss": 0.5794, "mean_token_accuracy": 0.880961686372757, "num_tokens": 159141695.0, "step": 1488 }, { "epoch": 3.3922462941847207, "grad_norm": 2.734375, "learning_rate": 3.943439705972654e-06, "loss": 0.604, "mean_token_accuracy": 0.8772746175527573, "num_tokens": 159248763.0, "step": 1489 }, { "epoch": 3.394526795895097, "grad_norm": 3.015625, "learning_rate": 3.94190150106408e-06, "loss": 0.577, "mean_token_accuracy": 0.8807346522808075, "num_tokens": 159355766.0, "step": 1490 }, { "epoch": 3.396807297605473, "grad_norm": 3.671875, "learning_rate": 3.9403624777600526e-06, "loss": 0.5787, "mean_token_accuracy": 0.8799644559621811, "num_tokens": 159462660.0, "step": 1491 }, { "epoch": 3.3990877993158497, "grad_norm": 4.15625, "learning_rate": 3.938822636934089e-06, "loss": 0.592, "mean_token_accuracy": 0.8790238797664642, "num_tokens": 159569562.0, "step": 1492 }, { "epoch": 3.401368301026226, "grad_norm": 7.1875, "learning_rate": 3.937281979460175e-06, "loss": 0.5845, "mean_token_accuracy": 0.8814513683319092, "num_tokens": 159676589.0, "step": 1493 }, { "epoch": 3.403648802736602, "grad_norm": 3.859375, "learning_rate": 3.9357405062127565e-06, "loss": 0.5722, "mean_token_accuracy": 0.8827214390039444, "num_tokens": 159783562.0, "step": 1494 }, { "epoch": 3.4059293044469783, "grad_norm": 2.875, "learning_rate": 3.934198218066745e-06, "loss": 0.5729, "mean_token_accuracy": 0.882139042019844, "num_tokens": 159891070.0, "step": 1495 }, { "epoch": 3.4082098061573545, "grad_norm": 6.34375, "learning_rate": 3.932655115897513e-06, "loss": 0.5722, "mean_token_accuracy": 0.8820941895246506, "num_tokens": 159997890.0, "step": 1496 }, { "epoch": 3.4104903078677307, "grad_norm": 4.0, "learning_rate": 3.9311112005808955e-06, "loss": 0.6003, "mean_token_accuracy": 0.8780702501535416, "num_tokens": 160104922.0, "step": 1497 }, { "epoch": 3.4127708095781073, "grad_norm": 6.15625, "learning_rate": 3.92956647299319e-06, "loss": 0.585, "mean_token_accuracy": 0.8826514631509781, "num_tokens": 160211984.0, "step": 1498 }, { "epoch": 3.4150513112884835, "grad_norm": 3.515625, "learning_rate": 3.928020934011153e-06, "loss": 0.6038, "mean_token_accuracy": 0.876240611076355, "num_tokens": 160319031.0, "step": 1499 }, { "epoch": 3.4173318129988597, "grad_norm": 2.296875, "learning_rate": 3.926474584512002e-06, "loss": 0.5919, "mean_token_accuracy": 0.8836843520402908, "num_tokens": 160425892.0, "step": 1500 }, { "epoch": 3.419612314709236, "grad_norm": 2.921875, "learning_rate": 3.924927425373417e-06, "loss": 0.5734, "mean_token_accuracy": 0.8801800161600113, "num_tokens": 160533225.0, "step": 1501 }, { "epoch": 3.4218928164196125, "grad_norm": 7.0625, "learning_rate": 3.9233794574735345e-06, "loss": 0.5941, "mean_token_accuracy": 0.8788256347179413, "num_tokens": 160640613.0, "step": 1502 }, { "epoch": 3.4241733181299887, "grad_norm": 5.8125, "learning_rate": 3.921830681690951e-06, "loss": 0.6065, "mean_token_accuracy": 0.8749442100524902, "num_tokens": 160746763.0, "step": 1503 }, { "epoch": 3.426453819840365, "grad_norm": 2.8125, "learning_rate": 3.920281098904722e-06, "loss": 0.5889, "mean_token_accuracy": 0.8785606771707535, "num_tokens": 160854121.0, "step": 1504 }, { "epoch": 3.428734321550741, "grad_norm": 2.515625, "learning_rate": 3.918730709994361e-06, "loss": 0.5771, "mean_token_accuracy": 0.8799080401659012, "num_tokens": 160961324.0, "step": 1505 }, { "epoch": 3.4310148232611173, "grad_norm": 3.15625, "learning_rate": 3.91717951583984e-06, "loss": 0.5808, "mean_token_accuracy": 0.8835075944662094, "num_tokens": 161068109.0, "step": 1506 }, { "epoch": 3.433295324971494, "grad_norm": 2.609375, "learning_rate": 3.915627517321584e-06, "loss": 0.5763, "mean_token_accuracy": 0.8793487548828125, "num_tokens": 161175558.0, "step": 1507 }, { "epoch": 3.43557582668187, "grad_norm": 5.03125, "learning_rate": 3.914074715320479e-06, "loss": 0.581, "mean_token_accuracy": 0.8839425444602966, "num_tokens": 161282715.0, "step": 1508 }, { "epoch": 3.4378563283922463, "grad_norm": 4.15625, "learning_rate": 3.912521110717866e-06, "loss": 0.5878, "mean_token_accuracy": 0.879220187664032, "num_tokens": 161390083.0, "step": 1509 }, { "epoch": 3.4401368301026225, "grad_norm": 3.1875, "learning_rate": 3.9109667043955405e-06, "loss": 0.5901, "mean_token_accuracy": 0.8790208101272583, "num_tokens": 161497044.0, "step": 1510 }, { "epoch": 3.4424173318129987, "grad_norm": 3.78125, "learning_rate": 3.909411497235752e-06, "loss": 0.5732, "mean_token_accuracy": 0.8846622407436371, "num_tokens": 161603896.0, "step": 1511 }, { "epoch": 3.4446978335233753, "grad_norm": 4.4375, "learning_rate": 3.907855490121208e-06, "loss": 0.5878, "mean_token_accuracy": 0.8800790905952454, "num_tokens": 161710931.0, "step": 1512 }, { "epoch": 3.4469783352337515, "grad_norm": 3.859375, "learning_rate": 3.906298683935068e-06, "loss": 0.5958, "mean_token_accuracy": 0.8813421279191971, "num_tokens": 161817975.0, "step": 1513 }, { "epoch": 3.4492588369441277, "grad_norm": 2.4375, "learning_rate": 3.904741079560944e-06, "loss": 0.5791, "mean_token_accuracy": 0.8818980902433395, "num_tokens": 161925410.0, "step": 1514 }, { "epoch": 3.451539338654504, "grad_norm": 2.6875, "learning_rate": 3.903182677882904e-06, "loss": 0.5729, "mean_token_accuracy": 0.8834614157676697, "num_tokens": 162032741.0, "step": 1515 }, { "epoch": 3.45381984036488, "grad_norm": 3.875, "learning_rate": 3.901623479785465e-06, "loss": 0.5823, "mean_token_accuracy": 0.8787531405687332, "num_tokens": 162139480.0, "step": 1516 }, { "epoch": 3.4561003420752567, "grad_norm": 3.03125, "learning_rate": 3.900063486153598e-06, "loss": 0.5717, "mean_token_accuracy": 0.8862394094467163, "num_tokens": 162246597.0, "step": 1517 }, { "epoch": 3.458380843785633, "grad_norm": 3.0625, "learning_rate": 3.898502697872725e-06, "loss": 0.5941, "mean_token_accuracy": 0.8790966123342514, "num_tokens": 162353845.0, "step": 1518 }, { "epoch": 3.460661345496009, "grad_norm": 2.421875, "learning_rate": 3.896941115828721e-06, "loss": 0.5811, "mean_token_accuracy": 0.8806100636720657, "num_tokens": 162460754.0, "step": 1519 }, { "epoch": 3.4629418472063853, "grad_norm": 3.9375, "learning_rate": 3.895378740907908e-06, "loss": 0.6153, "mean_token_accuracy": 0.8727654367685318, "num_tokens": 162567222.0, "step": 1520 }, { "epoch": 3.4652223489167615, "grad_norm": 2.671875, "learning_rate": 3.89381557399706e-06, "loss": 0.5973, "mean_token_accuracy": 0.8774588257074356, "num_tokens": 162674330.0, "step": 1521 }, { "epoch": 3.467502850627138, "grad_norm": 2.515625, "learning_rate": 3.892251615983401e-06, "loss": 0.587, "mean_token_accuracy": 0.8811975121498108, "num_tokens": 162781710.0, "step": 1522 }, { "epoch": 3.4697833523375143, "grad_norm": 2.65625, "learning_rate": 3.890686867754604e-06, "loss": 0.6, "mean_token_accuracy": 0.8789321780204773, "num_tokens": 162888560.0, "step": 1523 }, { "epoch": 3.4720638540478905, "grad_norm": 2.546875, "learning_rate": 3.889121330198788e-06, "loss": 0.5669, "mean_token_accuracy": 0.8869301974773407, "num_tokens": 162995694.0, "step": 1524 }, { "epoch": 3.4743443557582667, "grad_norm": 2.578125, "learning_rate": 3.887555004204524e-06, "loss": 0.5906, "mean_token_accuracy": 0.8777419775724411, "num_tokens": 163103255.0, "step": 1525 }, { "epoch": 3.476624857468643, "grad_norm": 2.796875, "learning_rate": 3.885987890660828e-06, "loss": 0.5806, "mean_token_accuracy": 0.8802089840173721, "num_tokens": 163210670.0, "step": 1526 }, { "epoch": 3.4789053591790196, "grad_norm": 3.234375, "learning_rate": 3.884419990457161e-06, "loss": 0.5972, "mean_token_accuracy": 0.8770112693309784, "num_tokens": 163317663.0, "step": 1527 }, { "epoch": 3.4811858608893957, "grad_norm": 3.0, "learning_rate": 3.882851304483436e-06, "loss": 0.599, "mean_token_accuracy": 0.8784664124250412, "num_tokens": 163424638.0, "step": 1528 }, { "epoch": 3.483466362599772, "grad_norm": 2.75, "learning_rate": 3.881281833630007e-06, "loss": 0.5561, "mean_token_accuracy": 0.8907039016485214, "num_tokens": 163532458.0, "step": 1529 }, { "epoch": 3.485746864310148, "grad_norm": 3.9375, "learning_rate": 3.879711578787676e-06, "loss": 0.5905, "mean_token_accuracy": 0.8753378838300705, "num_tokens": 163639976.0, "step": 1530 }, { "epoch": 3.4880273660205243, "grad_norm": 2.625, "learning_rate": 3.87814054084769e-06, "loss": 0.5656, "mean_token_accuracy": 0.8845688998699188, "num_tokens": 163747328.0, "step": 1531 }, { "epoch": 3.490307867730901, "grad_norm": 3.015625, "learning_rate": 3.8765687207017375e-06, "loss": 0.5904, "mean_token_accuracy": 0.8783632963895798, "num_tokens": 163854482.0, "step": 1532 }, { "epoch": 3.492588369441277, "grad_norm": 2.4375, "learning_rate": 3.874996119241956e-06, "loss": 0.5855, "mean_token_accuracy": 0.8804601579904556, "num_tokens": 163961669.0, "step": 1533 }, { "epoch": 3.4948688711516533, "grad_norm": 3.125, "learning_rate": 3.873422737360922e-06, "loss": 0.5816, "mean_token_accuracy": 0.8807873427867889, "num_tokens": 164068614.0, "step": 1534 }, { "epoch": 3.4971493728620295, "grad_norm": 2.609375, "learning_rate": 3.871848575951658e-06, "loss": 0.564, "mean_token_accuracy": 0.8857271820306778, "num_tokens": 164175640.0, "step": 1535 }, { "epoch": 3.4994298745724057, "grad_norm": 3.546875, "learning_rate": 3.8702736359076265e-06, "loss": 0.5652, "mean_token_accuracy": 0.8821373879909515, "num_tokens": 164283025.0, "step": 1536 }, { "epoch": 3.5017103762827824, "grad_norm": 2.140625, "learning_rate": 3.868697918122733e-06, "loss": 0.5911, "mean_token_accuracy": 0.8805082738399506, "num_tokens": 164389945.0, "step": 1537 }, { "epoch": 3.5039908779931586, "grad_norm": 4.4375, "learning_rate": 3.867121423491325e-06, "loss": 0.5697, "mean_token_accuracy": 0.8843076974153519, "num_tokens": 164497358.0, "step": 1538 }, { "epoch": 3.5062713797035348, "grad_norm": 4.71875, "learning_rate": 3.86554415290819e-06, "loss": 0.5839, "mean_token_accuracy": 0.8772299587726593, "num_tokens": 164604547.0, "step": 1539 }, { "epoch": 3.508551881413911, "grad_norm": 3.0, "learning_rate": 3.8639661072685575e-06, "loss": 0.5968, "mean_token_accuracy": 0.8764625191688538, "num_tokens": 164711483.0, "step": 1540 }, { "epoch": 3.508551881413911, "eval_loss": 0.5934817790985107, "eval_mean_token_accuracy": 0.8786315757058872, "eval_num_tokens": 164711483.0, "eval_runtime": 58.6311, "eval_samples_per_second": 143.013, "eval_steps_per_second": 4.486, "step": 1540 }, { "epoch": 3.5108323831242876, "grad_norm": 2.46875, "learning_rate": 3.862387287468095e-06, "loss": 0.5669, "mean_token_accuracy": 0.884019061923027, "num_tokens": 164818766.0, "step": 1541 }, { "epoch": 3.5131128848346638, "grad_norm": 3.03125, "learning_rate": 3.860807694402909e-06, "loss": 0.6091, "mean_token_accuracy": 0.872483566403389, "num_tokens": 164925458.0, "step": 1542 }, { "epoch": 3.51539338654504, "grad_norm": 2.46875, "learning_rate": 3.859227328969547e-06, "loss": 0.5638, "mean_token_accuracy": 0.8828099370002747, "num_tokens": 165033022.0, "step": 1543 }, { "epoch": 3.517673888255416, "grad_norm": 4.09375, "learning_rate": 3.857646192064995e-06, "loss": 0.5737, "mean_token_accuracy": 0.885229229927063, "num_tokens": 165140269.0, "step": 1544 }, { "epoch": 3.5199543899657924, "grad_norm": 2.640625, "learning_rate": 3.856064284586674e-06, "loss": 0.581, "mean_token_accuracy": 0.878631979227066, "num_tokens": 165247172.0, "step": 1545 }, { "epoch": 3.5222348916761685, "grad_norm": 4.84375, "learning_rate": 3.854481607432445e-06, "loss": 0.5869, "mean_token_accuracy": 0.881737545132637, "num_tokens": 165353974.0, "step": 1546 }, { "epoch": 3.524515393386545, "grad_norm": 2.609375, "learning_rate": 3.852898161500605e-06, "loss": 0.5807, "mean_token_accuracy": 0.8790680915117264, "num_tokens": 165461254.0, "step": 1547 }, { "epoch": 3.5267958950969214, "grad_norm": 3.09375, "learning_rate": 3.851313947689888e-06, "loss": 0.6022, "mean_token_accuracy": 0.8754421025514603, "num_tokens": 165568029.0, "step": 1548 }, { "epoch": 3.5290763968072976, "grad_norm": 2.84375, "learning_rate": 3.849728966899462e-06, "loss": 0.5803, "mean_token_accuracy": 0.8801588118076324, "num_tokens": 165675255.0, "step": 1549 }, { "epoch": 3.5313568985176738, "grad_norm": 3.234375, "learning_rate": 3.848143220028931e-06, "loss": 0.5872, "mean_token_accuracy": 0.8785509765148163, "num_tokens": 165781381.0, "step": 1550 }, { "epoch": 3.5336374002280504, "grad_norm": 2.203125, "learning_rate": 3.846556707978337e-06, "loss": 0.556, "mean_token_accuracy": 0.8853710889816284, "num_tokens": 165888356.0, "step": 1551 }, { "epoch": 3.5359179019384266, "grad_norm": 3.515625, "learning_rate": 3.844969431648151e-06, "loss": 0.5837, "mean_token_accuracy": 0.8783566206693649, "num_tokens": 165995028.0, "step": 1552 }, { "epoch": 3.538198403648803, "grad_norm": 3.171875, "learning_rate": 3.843381391939281e-06, "loss": 0.5928, "mean_token_accuracy": 0.8758636116981506, "num_tokens": 166102173.0, "step": 1553 }, { "epoch": 3.540478905359179, "grad_norm": 2.65625, "learning_rate": 3.841792589753067e-06, "loss": 0.5566, "mean_token_accuracy": 0.8812205046415329, "num_tokens": 166209381.0, "step": 1554 }, { "epoch": 3.542759407069555, "grad_norm": 5.65625, "learning_rate": 3.840203025991285e-06, "loss": 0.5516, "mean_token_accuracy": 0.8851082473993301, "num_tokens": 166316578.0, "step": 1555 }, { "epoch": 3.5450399087799314, "grad_norm": 3.484375, "learning_rate": 3.838612701556138e-06, "loss": 0.5828, "mean_token_accuracy": 0.879833921790123, "num_tokens": 166423915.0, "step": 1556 }, { "epoch": 3.547320410490308, "grad_norm": 3.125, "learning_rate": 3.837021617350266e-06, "loss": 0.585, "mean_token_accuracy": 0.8793807029724121, "num_tokens": 166530907.0, "step": 1557 }, { "epoch": 3.549600912200684, "grad_norm": 2.375, "learning_rate": 3.8354297742767345e-06, "loss": 0.5728, "mean_token_accuracy": 0.8835282325744629, "num_tokens": 166638263.0, "step": 1558 }, { "epoch": 3.5518814139110604, "grad_norm": 4.15625, "learning_rate": 3.833837173239044e-06, "loss": 0.5898, "mean_token_accuracy": 0.8798348754644394, "num_tokens": 166745468.0, "step": 1559 }, { "epoch": 3.5541619156214366, "grad_norm": 3.265625, "learning_rate": 3.832243815141126e-06, "loss": 0.5816, "mean_token_accuracy": 0.8829097151756287, "num_tokens": 166852138.0, "step": 1560 }, { "epoch": 3.556442417331813, "grad_norm": 2.703125, "learning_rate": 3.830649700887339e-06, "loss": 0.6001, "mean_token_accuracy": 0.8771270960569382, "num_tokens": 166958904.0, "step": 1561 }, { "epoch": 3.5587229190421894, "grad_norm": 2.4375, "learning_rate": 3.829054831382471e-06, "loss": 0.5895, "mean_token_accuracy": 0.8783409595489502, "num_tokens": 167066230.0, "step": 1562 }, { "epoch": 3.5610034207525656, "grad_norm": 2.453125, "learning_rate": 3.827459207531739e-06, "loss": 0.5762, "mean_token_accuracy": 0.8838016241788864, "num_tokens": 167173536.0, "step": 1563 }, { "epoch": 3.563283922462942, "grad_norm": 2.625, "learning_rate": 3.825862830240787e-06, "loss": 0.6079, "mean_token_accuracy": 0.8744715452194214, "num_tokens": 167280709.0, "step": 1564 }, { "epoch": 3.565564424173318, "grad_norm": 3.140625, "learning_rate": 3.82426570041569e-06, "loss": 0.5893, "mean_token_accuracy": 0.8770562261343002, "num_tokens": 167387081.0, "step": 1565 }, { "epoch": 3.567844925883694, "grad_norm": 2.71875, "learning_rate": 3.822667818962948e-06, "loss": 0.5612, "mean_token_accuracy": 0.8852861523628235, "num_tokens": 167494025.0, "step": 1566 }, { "epoch": 3.570125427594071, "grad_norm": 2.5, "learning_rate": 3.821069186789486e-06, "loss": 0.5904, "mean_token_accuracy": 0.8804926127195358, "num_tokens": 167600832.0, "step": 1567 }, { "epoch": 3.572405929304447, "grad_norm": 3.296875, "learning_rate": 3.819469804802659e-06, "loss": 0.5898, "mean_token_accuracy": 0.8771549314260483, "num_tokens": 167707990.0, "step": 1568 }, { "epoch": 3.574686431014823, "grad_norm": 2.578125, "learning_rate": 3.8178696739102435e-06, "loss": 0.5968, "mean_token_accuracy": 0.8794214427471161, "num_tokens": 167815132.0, "step": 1569 }, { "epoch": 3.5769669327251994, "grad_norm": 5.65625, "learning_rate": 3.816268795020443e-06, "loss": 0.5548, "mean_token_accuracy": 0.8854867666959763, "num_tokens": 167922544.0, "step": 1570 }, { "epoch": 3.579247434435576, "grad_norm": 2.84375, "learning_rate": 3.814667169041887e-06, "loss": 0.6035, "mean_token_accuracy": 0.8798968493938446, "num_tokens": 168029140.0, "step": 1571 }, { "epoch": 3.581527936145952, "grad_norm": 5.09375, "learning_rate": 3.8130647968836254e-06, "loss": 0.6082, "mean_token_accuracy": 0.8785337060689926, "num_tokens": 168136139.0, "step": 1572 }, { "epoch": 3.5838084378563284, "grad_norm": 2.34375, "learning_rate": 3.811461679455136e-06, "loss": 0.58, "mean_token_accuracy": 0.8779959082603455, "num_tokens": 168242818.0, "step": 1573 }, { "epoch": 3.5860889395667046, "grad_norm": 3.765625, "learning_rate": 3.809857817666316e-06, "loss": 0.5924, "mean_token_accuracy": 0.8785370439291, "num_tokens": 168350600.0, "step": 1574 }, { "epoch": 3.588369441277081, "grad_norm": 2.359375, "learning_rate": 3.808253212427486e-06, "loss": 0.5769, "mean_token_accuracy": 0.879870593547821, "num_tokens": 168457735.0, "step": 1575 }, { "epoch": 3.590649942987457, "grad_norm": 2.78125, "learning_rate": 3.8066478646493898e-06, "loss": 0.5731, "mean_token_accuracy": 0.8814902901649475, "num_tokens": 168564725.0, "step": 1576 }, { "epoch": 3.5929304446978336, "grad_norm": 4.65625, "learning_rate": 3.805041775243191e-06, "loss": 0.592, "mean_token_accuracy": 0.8772566318511963, "num_tokens": 168671913.0, "step": 1577 }, { "epoch": 3.59521094640821, "grad_norm": 3.53125, "learning_rate": 3.803434945120475e-06, "loss": 0.5593, "mean_token_accuracy": 0.8856948614120483, "num_tokens": 168779191.0, "step": 1578 }, { "epoch": 3.597491448118586, "grad_norm": 2.90625, "learning_rate": 3.801827375193249e-06, "loss": 0.5855, "mean_token_accuracy": 0.8798210918903351, "num_tokens": 168886143.0, "step": 1579 }, { "epoch": 3.5997719498289626, "grad_norm": 6.9375, "learning_rate": 3.8002190663739362e-06, "loss": 0.5783, "mean_token_accuracy": 0.8814046531915665, "num_tokens": 168993134.0, "step": 1580 }, { "epoch": 3.602052451539339, "grad_norm": 8.6875, "learning_rate": 3.798610019575384e-06, "loss": 0.5846, "mean_token_accuracy": 0.8799954205751419, "num_tokens": 169099760.0, "step": 1581 }, { "epoch": 3.604332953249715, "grad_norm": 6.90625, "learning_rate": 3.7970002357108554e-06, "loss": 0.5825, "mean_token_accuracy": 0.8843145817518234, "num_tokens": 169206545.0, "step": 1582 }, { "epoch": 3.6066134549600912, "grad_norm": 3.625, "learning_rate": 3.7953897156940323e-06, "loss": 0.5927, "mean_token_accuracy": 0.8787747472524643, "num_tokens": 169313229.0, "step": 1583 }, { "epoch": 3.6088939566704674, "grad_norm": 2.75, "learning_rate": 3.793778460439015e-06, "loss": 0.5878, "mean_token_accuracy": 0.8789673447608948, "num_tokens": 169420044.0, "step": 1584 }, { "epoch": 3.6111744583808436, "grad_norm": 4.59375, "learning_rate": 3.792166470860321e-06, "loss": 0.5764, "mean_token_accuracy": 0.8790552765130997, "num_tokens": 169526883.0, "step": 1585 }, { "epoch": 3.61345496009122, "grad_norm": 4.03125, "learning_rate": 3.790553747872885e-06, "loss": 0.5689, "mean_token_accuracy": 0.8849265277385712, "num_tokens": 169633699.0, "step": 1586 }, { "epoch": 3.6157354618015964, "grad_norm": 4.96875, "learning_rate": 3.788940292392056e-06, "loss": 0.5641, "mean_token_accuracy": 0.8801223486661911, "num_tokens": 169741250.0, "step": 1587 }, { "epoch": 3.6180159635119726, "grad_norm": 5.59375, "learning_rate": 3.787326105333601e-06, "loss": 0.5694, "mean_token_accuracy": 0.8809481114149094, "num_tokens": 169848862.0, "step": 1588 }, { "epoch": 3.620296465222349, "grad_norm": 2.453125, "learning_rate": 3.7857111876137017e-06, "loss": 0.5862, "mean_token_accuracy": 0.8804655224084854, "num_tokens": 169955712.0, "step": 1589 }, { "epoch": 3.6225769669327255, "grad_norm": 2.9375, "learning_rate": 3.784095540148954e-06, "loss": 0.5981, "mean_token_accuracy": 0.8755791783332825, "num_tokens": 170062534.0, "step": 1590 }, { "epoch": 3.6248574686431017, "grad_norm": 2.859375, "learning_rate": 3.7824791638563674e-06, "loss": 0.5882, "mean_token_accuracy": 0.8761962950229645, "num_tokens": 170169349.0, "step": 1591 }, { "epoch": 3.627137970353478, "grad_norm": 4.21875, "learning_rate": 3.7808620596533675e-06, "loss": 0.5726, "mean_token_accuracy": 0.8823762983083725, "num_tokens": 170276611.0, "step": 1592 }, { "epoch": 3.629418472063854, "grad_norm": 3.234375, "learning_rate": 3.77924422845779e-06, "loss": 0.5998, "mean_token_accuracy": 0.873461440205574, "num_tokens": 170383192.0, "step": 1593 }, { "epoch": 3.6316989737742302, "grad_norm": 2.65625, "learning_rate": 3.7776256711878856e-06, "loss": 0.5914, "mean_token_accuracy": 0.8810142874717712, "num_tokens": 170490101.0, "step": 1594 }, { "epoch": 3.6339794754846064, "grad_norm": 2.765625, "learning_rate": 3.7760063887623155e-06, "loss": 0.583, "mean_token_accuracy": 0.8790360242128372, "num_tokens": 170596909.0, "step": 1595 }, { "epoch": 3.636259977194983, "grad_norm": 3.4375, "learning_rate": 3.7743863821001538e-06, "loss": 0.5571, "mean_token_accuracy": 0.886503741145134, "num_tokens": 170704691.0, "step": 1596 }, { "epoch": 3.6385404789053593, "grad_norm": 2.96875, "learning_rate": 3.7727656521208843e-06, "loss": 0.5991, "mean_token_accuracy": 0.8780782669782639, "num_tokens": 170811447.0, "step": 1597 }, { "epoch": 3.6408209806157354, "grad_norm": 3.796875, "learning_rate": 3.771144199744402e-06, "loss": 0.5773, "mean_token_accuracy": 0.8789367228746414, "num_tokens": 170918012.0, "step": 1598 }, { "epoch": 3.6431014823261116, "grad_norm": 2.421875, "learning_rate": 3.7695220258910124e-06, "loss": 0.5963, "mean_token_accuracy": 0.8771436512470245, "num_tokens": 171024919.0, "step": 1599 }, { "epoch": 3.6453819840364883, "grad_norm": 3.625, "learning_rate": 3.7678991314814305e-06, "loss": 0.6, "mean_token_accuracy": 0.8739635199308395, "num_tokens": 171131530.0, "step": 1600 }, { "epoch": 3.6476624857468645, "grad_norm": 3.84375, "learning_rate": 3.766275517436779e-06, "loss": 0.5897, "mean_token_accuracy": 0.8780031353235245, "num_tokens": 171238533.0, "step": 1601 }, { "epoch": 3.6499429874572407, "grad_norm": 3.15625, "learning_rate": 3.7646511846785904e-06, "loss": 0.5746, "mean_token_accuracy": 0.8816352784633636, "num_tokens": 171345881.0, "step": 1602 }, { "epoch": 3.652223489167617, "grad_norm": 4.0625, "learning_rate": 3.7630261341288044e-06, "loss": 0.5673, "mean_token_accuracy": 0.883026197552681, "num_tokens": 171452807.0, "step": 1603 }, { "epoch": 3.654503990877993, "grad_norm": 4.0, "learning_rate": 3.7614003667097674e-06, "loss": 0.6065, "mean_token_accuracy": 0.8770434111356735, "num_tokens": 171559794.0, "step": 1604 }, { "epoch": 3.6567844925883692, "grad_norm": 2.515625, "learning_rate": 3.759773883344236e-06, "loss": 0.6215, "mean_token_accuracy": 0.8752636462450027, "num_tokens": 171666284.0, "step": 1605 }, { "epoch": 3.659064994298746, "grad_norm": 5.625, "learning_rate": 3.7581466849553685e-06, "loss": 0.569, "mean_token_accuracy": 0.884151503443718, "num_tokens": 171774291.0, "step": 1606 }, { "epoch": 3.661345496009122, "grad_norm": 2.5, "learning_rate": 3.7565187724667324e-06, "loss": 0.562, "mean_token_accuracy": 0.8847729563713074, "num_tokens": 171881709.0, "step": 1607 }, { "epoch": 3.6636259977194983, "grad_norm": 2.53125, "learning_rate": 3.7548901468022993e-06, "loss": 0.5917, "mean_token_accuracy": 0.8790787905454636, "num_tokens": 171988700.0, "step": 1608 }, { "epoch": 3.6659064994298745, "grad_norm": 3.5, "learning_rate": 3.7532608088864444e-06, "loss": 0.5883, "mean_token_accuracy": 0.876745343208313, "num_tokens": 172095530.0, "step": 1609 }, { "epoch": 3.668187001140251, "grad_norm": 3.0625, "learning_rate": 3.75163075964395e-06, "loss": 0.5797, "mean_token_accuracy": 0.8810919225215912, "num_tokens": 172202580.0, "step": 1610 }, { "epoch": 3.6704675028506273, "grad_norm": 3.140625, "learning_rate": 3.7500000000000005e-06, "loss": 0.5822, "mean_token_accuracy": 0.8769372403621674, "num_tokens": 172310027.0, "step": 1611 }, { "epoch": 3.6727480045610035, "grad_norm": 2.265625, "learning_rate": 3.748368530880183e-06, "loss": 0.581, "mean_token_accuracy": 0.8828428983688354, "num_tokens": 172416427.0, "step": 1612 }, { "epoch": 3.6750285062713797, "grad_norm": 2.6875, "learning_rate": 3.7467363532104874e-06, "loss": 0.5595, "mean_token_accuracy": 0.8848859220743179, "num_tokens": 172523861.0, "step": 1613 }, { "epoch": 3.677309007981756, "grad_norm": 2.796875, "learning_rate": 3.7451034679173082e-06, "loss": 0.5876, "mean_token_accuracy": 0.8770375549793243, "num_tokens": 172630804.0, "step": 1614 }, { "epoch": 3.679589509692132, "grad_norm": 2.5625, "learning_rate": 3.7434698759274366e-06, "loss": 0.5891, "mean_token_accuracy": 0.8780438005924225, "num_tokens": 172738455.0, "step": 1615 }, { "epoch": 3.6818700114025087, "grad_norm": 3.40625, "learning_rate": 3.741835578168071e-06, "loss": 0.5635, "mean_token_accuracy": 0.8857946693897247, "num_tokens": 172845911.0, "step": 1616 }, { "epoch": 3.684150513112885, "grad_norm": 4.4375, "learning_rate": 3.740200575566806e-06, "loss": 0.599, "mean_token_accuracy": 0.8763966113328934, "num_tokens": 172952684.0, "step": 1617 }, { "epoch": 3.686431014823261, "grad_norm": 2.515625, "learning_rate": 3.7385648690516364e-06, "loss": 0.5814, "mean_token_accuracy": 0.879105344414711, "num_tokens": 173059713.0, "step": 1618 }, { "epoch": 3.6887115165336373, "grad_norm": 3.953125, "learning_rate": 3.7369284595509587e-06, "loss": 0.5858, "mean_token_accuracy": 0.873798742890358, "num_tokens": 173167102.0, "step": 1619 }, { "epoch": 3.690992018244014, "grad_norm": 2.890625, "learning_rate": 3.7352913479935672e-06, "loss": 0.5757, "mean_token_accuracy": 0.8798558115959167, "num_tokens": 173274635.0, "step": 1620 }, { "epoch": 3.69327251995439, "grad_norm": 2.765625, "learning_rate": 3.7336535353086546e-06, "loss": 0.5997, "mean_token_accuracy": 0.8765471577644348, "num_tokens": 173382044.0, "step": 1621 }, { "epoch": 3.6955530216647663, "grad_norm": 2.921875, "learning_rate": 3.7320150224258124e-06, "loss": 0.5818, "mean_token_accuracy": 0.8792853504419327, "num_tokens": 173489148.0, "step": 1622 }, { "epoch": 3.6978335233751425, "grad_norm": 3.25, "learning_rate": 3.7303758102750274e-06, "loss": 0.5749, "mean_token_accuracy": 0.8817913979291916, "num_tokens": 173596404.0, "step": 1623 }, { "epoch": 3.7001140250855187, "grad_norm": 2.75, "learning_rate": 3.7287358997866872e-06, "loss": 0.588, "mean_token_accuracy": 0.8805738687515259, "num_tokens": 173703327.0, "step": 1624 }, { "epoch": 3.702394526795895, "grad_norm": 3.25, "learning_rate": 3.7270952918915715e-06, "loss": 0.5788, "mean_token_accuracy": 0.8786198794841766, "num_tokens": 173809938.0, "step": 1625 }, { "epoch": 3.7046750285062715, "grad_norm": 2.71875, "learning_rate": 3.7254539875208577e-06, "loss": 0.5862, "mean_token_accuracy": 0.8781810998916626, "num_tokens": 173916969.0, "step": 1626 }, { "epoch": 3.7069555302166477, "grad_norm": 2.703125, "learning_rate": 3.7238119876061196e-06, "loss": 0.5864, "mean_token_accuracy": 0.8811867833137512, "num_tokens": 174023879.0, "step": 1627 }, { "epoch": 3.709236031927024, "grad_norm": 4.4375, "learning_rate": 3.7221692930793234e-06, "loss": 0.5689, "mean_token_accuracy": 0.8845594227313995, "num_tokens": 174130854.0, "step": 1628 }, { "epoch": 3.7115165336374, "grad_norm": 2.71875, "learning_rate": 3.7205259048728316e-06, "loss": 0.564, "mean_token_accuracy": 0.8832680881023407, "num_tokens": 174238556.0, "step": 1629 }, { "epoch": 3.7137970353477767, "grad_norm": 2.515625, "learning_rate": 3.718881823919399e-06, "loss": 0.5787, "mean_token_accuracy": 0.8850577622652054, "num_tokens": 174346096.0, "step": 1630 }, { "epoch": 3.716077537058153, "grad_norm": 3.28125, "learning_rate": 3.717237051152175e-06, "loss": 0.5738, "mean_token_accuracy": 0.8809388726949692, "num_tokens": 174453264.0, "step": 1631 }, { "epoch": 3.718358038768529, "grad_norm": 3.8125, "learning_rate": 3.7155915875047005e-06, "loss": 0.5737, "mean_token_accuracy": 0.8833019137382507, "num_tokens": 174560352.0, "step": 1632 }, { "epoch": 3.7206385404789053, "grad_norm": 4.0, "learning_rate": 3.7139454339109082e-06, "loss": 0.59, "mean_token_accuracy": 0.8777986019849777, "num_tokens": 174667183.0, "step": 1633 }, { "epoch": 3.7229190421892815, "grad_norm": 5.375, "learning_rate": 3.7122985913051242e-06, "loss": 0.5664, "mean_token_accuracy": 0.8856906592845917, "num_tokens": 174774451.0, "step": 1634 }, { "epoch": 3.7251995438996577, "grad_norm": 2.203125, "learning_rate": 3.710651060622064e-06, "loss": 0.5653, "mean_token_accuracy": 0.8867538422346115, "num_tokens": 174881357.0, "step": 1635 }, { "epoch": 3.7274800456100343, "grad_norm": 3.671875, "learning_rate": 3.7090028427968343e-06, "loss": 0.6073, "mean_token_accuracy": 0.8716320544481277, "num_tokens": 174987742.0, "step": 1636 }, { "epoch": 3.7297605473204105, "grad_norm": 3.546875, "learning_rate": 3.7073539387649316e-06, "loss": 0.5933, "mean_token_accuracy": 0.8786651045084, "num_tokens": 175094511.0, "step": 1637 }, { "epoch": 3.7320410490307867, "grad_norm": 3.78125, "learning_rate": 3.7057043494622423e-06, "loss": 0.6106, "mean_token_accuracy": 0.8771418780088425, "num_tokens": 175201546.0, "step": 1638 }, { "epoch": 3.734321550741163, "grad_norm": 3.046875, "learning_rate": 3.704054075825042e-06, "loss": 0.5927, "mean_token_accuracy": 0.8767105042934418, "num_tokens": 175308332.0, "step": 1639 }, { "epoch": 3.7366020524515395, "grad_norm": 2.8125, "learning_rate": 3.702403118789992e-06, "loss": 0.5588, "mean_token_accuracy": 0.8864465802907944, "num_tokens": 175415556.0, "step": 1640 }, { "epoch": 3.7388825541619157, "grad_norm": 5.78125, "learning_rate": 3.7007514792941462e-06, "loss": 0.5763, "mean_token_accuracy": 0.8830393701791763, "num_tokens": 175522759.0, "step": 1641 }, { "epoch": 3.741163055872292, "grad_norm": 3.03125, "learning_rate": 3.6990991582749414e-06, "loss": 0.5712, "mean_token_accuracy": 0.8837911188602448, "num_tokens": 175630033.0, "step": 1642 }, { "epoch": 3.743443557582668, "grad_norm": 3.125, "learning_rate": 3.6974461566702048e-06, "loss": 0.5963, "mean_token_accuracy": 0.8770407438278198, "num_tokens": 175737297.0, "step": 1643 }, { "epoch": 3.7457240592930443, "grad_norm": 2.578125, "learning_rate": 3.695792475418146e-06, "loss": 0.5899, "mean_token_accuracy": 0.8814648389816284, "num_tokens": 175844521.0, "step": 1644 }, { "epoch": 3.7480045610034205, "grad_norm": 6.34375, "learning_rate": 3.6941381154573646e-06, "loss": 0.5928, "mean_token_accuracy": 0.878902330994606, "num_tokens": 175951433.0, "step": 1645 }, { "epoch": 3.750285062713797, "grad_norm": 4.84375, "learning_rate": 3.692483077726843e-06, "loss": 0.57, "mean_token_accuracy": 0.884130209684372, "num_tokens": 176058602.0, "step": 1646 }, { "epoch": 3.7525655644241733, "grad_norm": 4.03125, "learning_rate": 3.6908273631659475e-06, "loss": 0.5716, "mean_token_accuracy": 0.8837015181779861, "num_tokens": 176165024.0, "step": 1647 }, { "epoch": 3.7548460661345495, "grad_norm": 4.5, "learning_rate": 3.689170972714431e-06, "loss": 0.5769, "mean_token_accuracy": 0.8826144188642502, "num_tokens": 176272341.0, "step": 1648 }, { "epoch": 3.757126567844926, "grad_norm": 2.78125, "learning_rate": 3.6875139073124277e-06, "loss": 0.5737, "mean_token_accuracy": 0.8824542462825775, "num_tokens": 176379289.0, "step": 1649 }, { "epoch": 3.7594070695553023, "grad_norm": 4.375, "learning_rate": 3.6858561679004567e-06, "loss": 0.5709, "mean_token_accuracy": 0.8805209249258041, "num_tokens": 176486406.0, "step": 1650 }, { "epoch": 3.7616875712656785, "grad_norm": 5.21875, "learning_rate": 3.684197755419419e-06, "loss": 0.5759, "mean_token_accuracy": 0.8811310976743698, "num_tokens": 176593647.0, "step": 1651 }, { "epoch": 3.7639680729760547, "grad_norm": 6.21875, "learning_rate": 3.6825386708105963e-06, "loss": 0.5575, "mean_token_accuracy": 0.8876552581787109, "num_tokens": 176700583.0, "step": 1652 }, { "epoch": 3.766248574686431, "grad_norm": 2.890625, "learning_rate": 3.6808789150156545e-06, "loss": 0.5908, "mean_token_accuracy": 0.8768218755722046, "num_tokens": 176807605.0, "step": 1653 }, { "epoch": 3.768529076396807, "grad_norm": 3.453125, "learning_rate": 3.679218488976638e-06, "loss": 0.5967, "mean_token_accuracy": 0.8734856247901917, "num_tokens": 176914867.0, "step": 1654 }, { "epoch": 3.7708095781071833, "grad_norm": 3.21875, "learning_rate": 3.677557393635973e-06, "loss": 0.5803, "mean_token_accuracy": 0.8815526366233826, "num_tokens": 177021989.0, "step": 1655 }, { "epoch": 3.77309007981756, "grad_norm": 3.21875, "learning_rate": 3.6758956299364643e-06, "loss": 0.574, "mean_token_accuracy": 0.8808587789535522, "num_tokens": 177128640.0, "step": 1656 }, { "epoch": 3.775370581527936, "grad_norm": 2.375, "learning_rate": 3.674233198821299e-06, "loss": 0.5594, "mean_token_accuracy": 0.8834094703197479, "num_tokens": 177236418.0, "step": 1657 }, { "epoch": 3.7776510832383123, "grad_norm": 3.078125, "learning_rate": 3.6725701012340387e-06, "loss": 0.5785, "mean_token_accuracy": 0.8827496618032455, "num_tokens": 177343371.0, "step": 1658 }, { "epoch": 3.779931584948689, "grad_norm": 2.25, "learning_rate": 3.6709063381186267e-06, "loss": 0.5819, "mean_token_accuracy": 0.8798905909061432, "num_tokens": 177450862.0, "step": 1659 }, { "epoch": 3.782212086659065, "grad_norm": 2.953125, "learning_rate": 3.6692419104193823e-06, "loss": 0.6005, "mean_token_accuracy": 0.8762509971857071, "num_tokens": 177557890.0, "step": 1660 }, { "epoch": 3.7844925883694414, "grad_norm": 3.0, "learning_rate": 3.6675768190810023e-06, "loss": 0.5784, "mean_token_accuracy": 0.8809296637773514, "num_tokens": 177664846.0, "step": 1661 }, { "epoch": 3.7867730900798175, "grad_norm": 2.578125, "learning_rate": 3.665911065048561e-06, "loss": 0.5714, "mean_token_accuracy": 0.8829169422388077, "num_tokens": 177772527.0, "step": 1662 }, { "epoch": 3.7890535917901937, "grad_norm": 5.25, "learning_rate": 3.6642446492675075e-06, "loss": 0.5813, "mean_token_accuracy": 0.8831394165754318, "num_tokens": 177879427.0, "step": 1663 }, { "epoch": 3.79133409350057, "grad_norm": 2.96875, "learning_rate": 3.6625775726836677e-06, "loss": 0.5886, "mean_token_accuracy": 0.8788637816905975, "num_tokens": 177986497.0, "step": 1664 }, { "epoch": 3.7936145952109466, "grad_norm": 3.046875, "learning_rate": 3.6609098362432425e-06, "loss": 0.5742, "mean_token_accuracy": 0.8837129175662994, "num_tokens": 178093286.0, "step": 1665 }, { "epoch": 3.7958950969213228, "grad_norm": 4.0, "learning_rate": 3.659241440892806e-06, "loss": 0.5736, "mean_token_accuracy": 0.8812308311462402, "num_tokens": 178200373.0, "step": 1666 }, { "epoch": 3.798175598631699, "grad_norm": 4.1875, "learning_rate": 3.6575723875793085e-06, "loss": 0.5917, "mean_token_accuracy": 0.8776303678750992, "num_tokens": 178307338.0, "step": 1667 }, { "epoch": 3.800456100342075, "grad_norm": 6.3125, "learning_rate": 3.655902677250071e-06, "loss": 0.5764, "mean_token_accuracy": 0.8835581988096237, "num_tokens": 178414549.0, "step": 1668 }, { "epoch": 3.802736602052452, "grad_norm": 3.578125, "learning_rate": 3.6542323108527896e-06, "loss": 0.5819, "mean_token_accuracy": 0.8782322406768799, "num_tokens": 178521662.0, "step": 1669 }, { "epoch": 3.805017103762828, "grad_norm": 2.453125, "learning_rate": 3.652561289335532e-06, "loss": 0.5973, "mean_token_accuracy": 0.8794067353010178, "num_tokens": 178627910.0, "step": 1670 }, { "epoch": 3.807297605473204, "grad_norm": 3.765625, "learning_rate": 3.6508896136467376e-06, "loss": 0.5739, "mean_token_accuracy": 0.8825301080942154, "num_tokens": 178735254.0, "step": 1671 }, { "epoch": 3.8095781071835804, "grad_norm": 3.0, "learning_rate": 3.649217284735217e-06, "loss": 0.5672, "mean_token_accuracy": 0.8833526968955994, "num_tokens": 178842561.0, "step": 1672 }, { "epoch": 3.8118586088939566, "grad_norm": 4.59375, "learning_rate": 3.6475443035501522e-06, "loss": 0.5977, "mean_token_accuracy": 0.8792631328105927, "num_tokens": 178949352.0, "step": 1673 }, { "epoch": 3.8141391106043327, "grad_norm": 3.90625, "learning_rate": 3.645870671041095e-06, "loss": 0.5765, "mean_token_accuracy": 0.8842526823282242, "num_tokens": 179056304.0, "step": 1674 }, { "epoch": 3.8164196123147094, "grad_norm": 4.15625, "learning_rate": 3.6441963881579668e-06, "loss": 0.586, "mean_token_accuracy": 0.8808672279119492, "num_tokens": 179163897.0, "step": 1675 }, { "epoch": 3.8187001140250856, "grad_norm": 4.75, "learning_rate": 3.642521455851058e-06, "loss": 0.5614, "mean_token_accuracy": 0.8823509216308594, "num_tokens": 179270806.0, "step": 1676 }, { "epoch": 3.8209806157354618, "grad_norm": 3.3125, "learning_rate": 3.6408458750710284e-06, "loss": 0.5578, "mean_token_accuracy": 0.8836124241352081, "num_tokens": 179378081.0, "step": 1677 }, { "epoch": 3.823261117445838, "grad_norm": 4.625, "learning_rate": 3.639169646768905e-06, "loss": 0.5895, "mean_token_accuracy": 0.8754040449857712, "num_tokens": 179485558.0, "step": 1678 }, { "epoch": 3.8255416191562146, "grad_norm": 5.21875, "learning_rate": 3.637492771896082e-06, "loss": 0.5817, "mean_token_accuracy": 0.8810373991727829, "num_tokens": 179592323.0, "step": 1679 }, { "epoch": 3.827822120866591, "grad_norm": 3.0625, "learning_rate": 3.6358152514043226e-06, "loss": 0.583, "mean_token_accuracy": 0.8800250738859177, "num_tokens": 179699420.0, "step": 1680 }, { "epoch": 3.830102622576967, "grad_norm": 3.90625, "learning_rate": 3.634137086245754e-06, "loss": 0.5725, "mean_token_accuracy": 0.8808536231517792, "num_tokens": 179806666.0, "step": 1681 }, { "epoch": 3.832383124287343, "grad_norm": 4.28125, "learning_rate": 3.6324582773728712e-06, "loss": 0.5973, "mean_token_accuracy": 0.8750377893447876, "num_tokens": 179914021.0, "step": 1682 }, { "epoch": 3.8346636259977194, "grad_norm": 3.078125, "learning_rate": 3.6307788257385325e-06, "loss": 0.5859, "mean_token_accuracy": 0.8790106326341629, "num_tokens": 180020983.0, "step": 1683 }, { "epoch": 3.8369441277080956, "grad_norm": 2.59375, "learning_rate": 3.6290987322959624e-06, "loss": 0.5493, "mean_token_accuracy": 0.8865607380867004, "num_tokens": 180128755.0, "step": 1684 }, { "epoch": 3.839224629418472, "grad_norm": 2.375, "learning_rate": 3.6274179979987507e-06, "loss": 0.5751, "mean_token_accuracy": 0.881371021270752, "num_tokens": 180235525.0, "step": 1685 }, { "epoch": 3.8415051311288484, "grad_norm": 2.890625, "learning_rate": 3.625736623800849e-06, "loss": 0.5911, "mean_token_accuracy": 0.878914400935173, "num_tokens": 180342761.0, "step": 1686 }, { "epoch": 3.8437856328392246, "grad_norm": 2.828125, "learning_rate": 3.624054610656572e-06, "loss": 0.5888, "mean_token_accuracy": 0.879774883389473, "num_tokens": 180450260.0, "step": 1687 }, { "epoch": 3.846066134549601, "grad_norm": 3.34375, "learning_rate": 3.622371959520599e-06, "loss": 0.5879, "mean_token_accuracy": 0.881365180015564, "num_tokens": 180557292.0, "step": 1688 }, { "epoch": 3.8483466362599774, "grad_norm": 2.734375, "learning_rate": 3.6206886713479705e-06, "loss": 0.6012, "mean_token_accuracy": 0.8755958080291748, "num_tokens": 180664350.0, "step": 1689 }, { "epoch": 3.8506271379703536, "grad_norm": 2.859375, "learning_rate": 3.6190047470940875e-06, "loss": 0.5677, "mean_token_accuracy": 0.8807881772518158, "num_tokens": 180771157.0, "step": 1690 }, { "epoch": 3.85290763968073, "grad_norm": 2.875, "learning_rate": 3.6173201877147134e-06, "loss": 0.5997, "mean_token_accuracy": 0.8763711750507355, "num_tokens": 180877774.0, "step": 1691 }, { "epoch": 3.855188141391106, "grad_norm": 3.90625, "learning_rate": 3.6156349941659717e-06, "loss": 0.5768, "mean_token_accuracy": 0.8838952630758286, "num_tokens": 180985055.0, "step": 1692 }, { "epoch": 3.857468643101482, "grad_norm": 4.53125, "learning_rate": 3.613949167404345e-06, "loss": 0.5875, "mean_token_accuracy": 0.8797314018011093, "num_tokens": 181091651.0, "step": 1693 }, { "epoch": 3.8597491448118584, "grad_norm": 4.3125, "learning_rate": 3.6122627083866773e-06, "loss": 0.5671, "mean_token_accuracy": 0.8829943537712097, "num_tokens": 181198250.0, "step": 1694 }, { "epoch": 3.862029646522235, "grad_norm": 3.578125, "learning_rate": 3.610575618070169e-06, "loss": 0.559, "mean_token_accuracy": 0.8832627683877945, "num_tokens": 181305182.0, "step": 1695 }, { "epoch": 3.864310148232611, "grad_norm": 3.609375, "learning_rate": 3.6088878974123796e-06, "loss": 0.5592, "mean_token_accuracy": 0.8848606944084167, "num_tokens": 181412694.0, "step": 1696 }, { "epoch": 3.8665906499429874, "grad_norm": 5.125, "learning_rate": 3.6071995473712284e-06, "loss": 0.5849, "mean_token_accuracy": 0.8789099603891373, "num_tokens": 181519464.0, "step": 1697 }, { "epoch": 3.8688711516533636, "grad_norm": 3.640625, "learning_rate": 3.605510568904989e-06, "loss": 0.6049, "mean_token_accuracy": 0.8752625435590744, "num_tokens": 181626561.0, "step": 1698 }, { "epoch": 3.8711516533637402, "grad_norm": 4.90625, "learning_rate": 3.6038209629722936e-06, "loss": 0.566, "mean_token_accuracy": 0.8840775638818741, "num_tokens": 181733816.0, "step": 1699 }, { "epoch": 3.8734321550741164, "grad_norm": 2.609375, "learning_rate": 3.6021307305321295e-06, "loss": 0.5629, "mean_token_accuracy": 0.8823692500591278, "num_tokens": 181841269.0, "step": 1700 }, { "epoch": 3.8757126567844926, "grad_norm": 4.40625, "learning_rate": 3.6004398725438406e-06, "loss": 0.5469, "mean_token_accuracy": 0.8876504898071289, "num_tokens": 181948467.0, "step": 1701 }, { "epoch": 3.877993158494869, "grad_norm": 4.8125, "learning_rate": 3.5987483899671245e-06, "loss": 0.5609, "mean_token_accuracy": 0.8821342885494232, "num_tokens": 182055789.0, "step": 1702 }, { "epoch": 3.880273660205245, "grad_norm": 4.5625, "learning_rate": 3.597056283762034e-06, "loss": 0.5856, "mean_token_accuracy": 0.8786395937204361, "num_tokens": 182163625.0, "step": 1703 }, { "epoch": 3.882554161915621, "grad_norm": 2.828125, "learning_rate": 3.5953635548889777e-06, "loss": 0.5699, "mean_token_accuracy": 0.8824369460344315, "num_tokens": 182270282.0, "step": 1704 }, { "epoch": 3.884834663625998, "grad_norm": 2.796875, "learning_rate": 3.5936702043087134e-06, "loss": 0.6037, "mean_token_accuracy": 0.8777669966220856, "num_tokens": 182376966.0, "step": 1705 }, { "epoch": 3.887115165336374, "grad_norm": 2.75, "learning_rate": 3.5919762329823556e-06, "loss": 0.5711, "mean_token_accuracy": 0.8837083727121353, "num_tokens": 182484405.0, "step": 1706 }, { "epoch": 3.88939566704675, "grad_norm": 4.125, "learning_rate": 3.5902816418713694e-06, "loss": 0.5778, "mean_token_accuracy": 0.8821840286254883, "num_tokens": 182591485.0, "step": 1707 }, { "epoch": 3.8916761687571264, "grad_norm": 8.375, "learning_rate": 3.5885864319375717e-06, "loss": 0.5867, "mean_token_accuracy": 0.8777542114257812, "num_tokens": 182699148.0, "step": 1708 }, { "epoch": 3.893956670467503, "grad_norm": 4.15625, "learning_rate": 3.5868906041431313e-06, "loss": 0.5726, "mean_token_accuracy": 0.8843749016523361, "num_tokens": 182805780.0, "step": 1709 }, { "epoch": 3.8962371721778792, "grad_norm": 4.125, "learning_rate": 3.5851941594505674e-06, "loss": 0.5906, "mean_token_accuracy": 0.8774453997612, "num_tokens": 182912492.0, "step": 1710 }, { "epoch": 3.8985176738882554, "grad_norm": 2.390625, "learning_rate": 3.5834970988227484e-06, "loss": 0.577, "mean_token_accuracy": 0.8813342452049255, "num_tokens": 183019581.0, "step": 1711 }, { "epoch": 3.9007981755986316, "grad_norm": 5.125, "learning_rate": 3.581799423222895e-06, "loss": 0.5784, "mean_token_accuracy": 0.8827546834945679, "num_tokens": 183126811.0, "step": 1712 }, { "epoch": 3.903078677309008, "grad_norm": 8.5, "learning_rate": 3.580101133614573e-06, "loss": 0.583, "mean_token_accuracy": 0.879145547747612, "num_tokens": 183234837.0, "step": 1713 }, { "epoch": 3.905359179019384, "grad_norm": 3.578125, "learning_rate": 3.5784022309617006e-06, "loss": 0.5614, "mean_token_accuracy": 0.8833577632904053, "num_tokens": 183342206.0, "step": 1714 }, { "epoch": 3.9076396807297606, "grad_norm": 4.0625, "learning_rate": 3.57670271622854e-06, "loss": 0.5784, "mean_token_accuracy": 0.8815594464540482, "num_tokens": 183448841.0, "step": 1715 }, { "epoch": 3.909920182440137, "grad_norm": 2.546875, "learning_rate": 3.5750025903797053e-06, "loss": 0.5777, "mean_token_accuracy": 0.8809472620487213, "num_tokens": 183555198.0, "step": 1716 }, { "epoch": 3.912200684150513, "grad_norm": 3.34375, "learning_rate": 3.5733018543801534e-06, "loss": 0.5877, "mean_token_accuracy": 0.8786370903253555, "num_tokens": 183661758.0, "step": 1717 }, { "epoch": 3.9144811858608897, "grad_norm": 6.25, "learning_rate": 3.5716005091951906e-06, "loss": 0.5861, "mean_token_accuracy": 0.8804649114608765, "num_tokens": 183768385.0, "step": 1718 }, { "epoch": 3.916761687571266, "grad_norm": 5.34375, "learning_rate": 3.569898555790466e-06, "loss": 0.5782, "mean_token_accuracy": 0.8821226358413696, "num_tokens": 183874915.0, "step": 1719 }, { "epoch": 3.919042189281642, "grad_norm": 3.921875, "learning_rate": 3.5681959951319766e-06, "loss": 0.6009, "mean_token_accuracy": 0.8761325925588608, "num_tokens": 183982390.0, "step": 1720 }, { "epoch": 3.9213226909920182, "grad_norm": 2.796875, "learning_rate": 3.566492828186063e-06, "loss": 0.5688, "mean_token_accuracy": 0.8820738047361374, "num_tokens": 184089244.0, "step": 1721 }, { "epoch": 3.9236031927023944, "grad_norm": 3.515625, "learning_rate": 3.564789055919409e-06, "loss": 0.5631, "mean_token_accuracy": 0.8825008720159531, "num_tokens": 184196515.0, "step": 1722 }, { "epoch": 3.9258836944127706, "grad_norm": 2.984375, "learning_rate": 3.5630846792990435e-06, "loss": 0.6072, "mean_token_accuracy": 0.8774504512548447, "num_tokens": 184303445.0, "step": 1723 }, { "epoch": 3.928164196123147, "grad_norm": 3.859375, "learning_rate": 3.5613796992923382e-06, "loss": 0.5728, "mean_token_accuracy": 0.8825356364250183, "num_tokens": 184410680.0, "step": 1724 }, { "epoch": 3.9304446978335235, "grad_norm": 2.90625, "learning_rate": 3.559674116867006e-06, "loss": 0.577, "mean_token_accuracy": 0.881066232919693, "num_tokens": 184517883.0, "step": 1725 }, { "epoch": 3.9327251995438997, "grad_norm": 2.390625, "learning_rate": 3.5579679329911025e-06, "loss": 0.5761, "mean_token_accuracy": 0.8830063939094543, "num_tokens": 184625069.0, "step": 1726 }, { "epoch": 3.935005701254276, "grad_norm": 3.234375, "learning_rate": 3.556261148633026e-06, "loss": 0.576, "mean_token_accuracy": 0.8830278366804123, "num_tokens": 184732358.0, "step": 1727 }, { "epoch": 3.9372862029646525, "grad_norm": 3.34375, "learning_rate": 3.5545537647615125e-06, "loss": 0.5697, "mean_token_accuracy": 0.8799594938755035, "num_tokens": 184839208.0, "step": 1728 }, { "epoch": 3.9395667046750287, "grad_norm": 2.671875, "learning_rate": 3.552845782345642e-06, "loss": 0.5872, "mean_token_accuracy": 0.8789944648742676, "num_tokens": 184945933.0, "step": 1729 }, { "epoch": 3.941847206385405, "grad_norm": 2.8125, "learning_rate": 3.551137202354831e-06, "loss": 0.5849, "mean_token_accuracy": 0.8809881657361984, "num_tokens": 185052752.0, "step": 1730 }, { "epoch": 3.944127708095781, "grad_norm": 2.84375, "learning_rate": 3.5494280257588367e-06, "loss": 0.5669, "mean_token_accuracy": 0.8846791535615921, "num_tokens": 185159970.0, "step": 1731 }, { "epoch": 3.9464082098061573, "grad_norm": 3.515625, "learning_rate": 3.547718253527755e-06, "loss": 0.5646, "mean_token_accuracy": 0.8843375146389008, "num_tokens": 185267250.0, "step": 1732 }, { "epoch": 3.9486887115165334, "grad_norm": 2.4375, "learning_rate": 3.546007886632019e-06, "loss": 0.5873, "mean_token_accuracy": 0.8792973756790161, "num_tokens": 185373715.0, "step": 1733 }, { "epoch": 3.95096921322691, "grad_norm": 2.796875, "learning_rate": 3.5442969260424022e-06, "loss": 0.5772, "mean_token_accuracy": 0.8815480768680573, "num_tokens": 185480906.0, "step": 1734 }, { "epoch": 3.9532497149372863, "grad_norm": 3.109375, "learning_rate": 3.5425853727300095e-06, "loss": 0.5888, "mean_token_accuracy": 0.8790358453989029, "num_tokens": 185588351.0, "step": 1735 }, { "epoch": 3.9555302166476625, "grad_norm": 2.8125, "learning_rate": 3.5408732276662882e-06, "loss": 0.5974, "mean_token_accuracy": 0.8829907327890396, "num_tokens": 185695047.0, "step": 1736 }, { "epoch": 3.9578107183580387, "grad_norm": 3.59375, "learning_rate": 3.5391604918230173e-06, "loss": 0.582, "mean_token_accuracy": 0.878580242395401, "num_tokens": 185802784.0, "step": 1737 }, { "epoch": 3.9600912200684153, "grad_norm": 2.765625, "learning_rate": 3.537447166172313e-06, "loss": 0.5724, "mean_token_accuracy": 0.879580944776535, "num_tokens": 185909720.0, "step": 1738 }, { "epoch": 3.9623717217787915, "grad_norm": 2.765625, "learning_rate": 3.5357332516866256e-06, "loss": 0.5551, "mean_token_accuracy": 0.8843010365962982, "num_tokens": 186016443.0, "step": 1739 }, { "epoch": 3.9646522234891677, "grad_norm": 3.375, "learning_rate": 3.534018749338741e-06, "loss": 0.5755, "mean_token_accuracy": 0.8813291490077972, "num_tokens": 186123915.0, "step": 1740 }, { "epoch": 3.966932725199544, "grad_norm": 2.8125, "learning_rate": 3.532303660101776e-06, "loss": 0.5741, "mean_token_accuracy": 0.8835762590169907, "num_tokens": 186230831.0, "step": 1741 }, { "epoch": 3.96921322690992, "grad_norm": 4.15625, "learning_rate": 3.530587984949183e-06, "loss": 0.6007, "mean_token_accuracy": 0.8778104931116104, "num_tokens": 186337578.0, "step": 1742 }, { "epoch": 3.9714937286202963, "grad_norm": 3.125, "learning_rate": 3.5288717248547453e-06, "loss": 0.5634, "mean_token_accuracy": 0.8861571699380875, "num_tokens": 186445605.0, "step": 1743 }, { "epoch": 3.973774230330673, "grad_norm": 3.65625, "learning_rate": 3.5271548807925803e-06, "loss": 0.5857, "mean_token_accuracy": 0.8777986913919449, "num_tokens": 186552754.0, "step": 1744 }, { "epoch": 3.976054732041049, "grad_norm": 2.90625, "learning_rate": 3.525437453737136e-06, "loss": 0.5911, "mean_token_accuracy": 0.8793213814496994, "num_tokens": 186659906.0, "step": 1745 }, { "epoch": 3.9783352337514253, "grad_norm": 3.15625, "learning_rate": 3.5237194446631883e-06, "loss": 0.5731, "mean_token_accuracy": 0.8833697587251663, "num_tokens": 186766193.0, "step": 1746 }, { "epoch": 3.9806157354618015, "grad_norm": 2.59375, "learning_rate": 3.522000854545849e-06, "loss": 0.5745, "mean_token_accuracy": 0.8789832890033722, "num_tokens": 186872982.0, "step": 1747 }, { "epoch": 3.982896237172178, "grad_norm": 2.78125, "learning_rate": 3.520281684360554e-06, "loss": 0.5893, "mean_token_accuracy": 0.8788421899080276, "num_tokens": 186979970.0, "step": 1748 }, { "epoch": 3.9851767388825543, "grad_norm": 2.875, "learning_rate": 3.5185619350830725e-06, "loss": 0.5813, "mean_token_accuracy": 0.8799102157354355, "num_tokens": 187086646.0, "step": 1749 }, { "epoch": 3.9874572405929305, "grad_norm": 2.296875, "learning_rate": 3.516841607689501e-06, "loss": 0.5596, "mean_token_accuracy": 0.8887099474668503, "num_tokens": 187194165.0, "step": 1750 }, { "epoch": 3.9897377423033067, "grad_norm": 3.078125, "learning_rate": 3.515120703156264e-06, "loss": 0.5885, "mean_token_accuracy": 0.8788872063159943, "num_tokens": 187300450.0, "step": 1751 }, { "epoch": 3.992018244013683, "grad_norm": 2.84375, "learning_rate": 3.5133992224601126e-06, "loss": 0.5894, "mean_token_accuracy": 0.8765320926904678, "num_tokens": 187407316.0, "step": 1752 }, { "epoch": 3.994298745724059, "grad_norm": 4.46875, "learning_rate": 3.511677166578128e-06, "loss": 0.5517, "mean_token_accuracy": 0.8845993727445602, "num_tokens": 187515138.0, "step": 1753 }, { "epoch": 3.9965792474344357, "grad_norm": 2.328125, "learning_rate": 3.509954536487714e-06, "loss": 0.5822, "mean_token_accuracy": 0.8815711736679077, "num_tokens": 187622052.0, "step": 1754 }, { "epoch": 3.998859749144812, "grad_norm": 4.40625, "learning_rate": 3.5082313331666035e-06, "loss": 0.5521, "mean_token_accuracy": 0.8849412500858307, "num_tokens": 187729710.0, "step": 1755 }, { "epoch": 4.0, "grad_norm": 4.90625, "learning_rate": 3.506507557592853e-06, "loss": 0.5827, "mean_token_accuracy": 0.879196435213089, "num_tokens": 187768928.0, "step": 1756 }, { "epoch": 4.002280501710376, "grad_norm": 3.0, "learning_rate": 3.5047832107448437e-06, "loss": 0.5866, "mean_token_accuracy": 0.8787702172994614, "num_tokens": 187875429.0, "step": 1757 }, { "epoch": 4.004561003420752, "grad_norm": 2.703125, "learning_rate": 3.503058293601283e-06, "loss": 0.5814, "mean_token_accuracy": 0.8796392232179642, "num_tokens": 187982859.0, "step": 1758 }, { "epoch": 4.006841505131129, "grad_norm": 2.921875, "learning_rate": 3.5013328071411995e-06, "loss": 0.5651, "mean_token_accuracy": 0.8837246298789978, "num_tokens": 188090247.0, "step": 1759 }, { "epoch": 4.009122006841505, "grad_norm": 2.640625, "learning_rate": 3.499606752343945e-06, "loss": 0.5934, "mean_token_accuracy": 0.8794354498386383, "num_tokens": 188198071.0, "step": 1760 }, { "epoch": 4.009122006841505, "eval_loss": 0.5914322733879089, "eval_mean_token_accuracy": 0.8791430533612182, "eval_num_tokens": 188198071.0, "eval_runtime": 58.7059, "eval_samples_per_second": 142.831, "eval_steps_per_second": 4.48, "step": 1760 }, { "epoch": 4.011402508551882, "grad_norm": 2.765625, "learning_rate": 3.4978801301891972e-06, "loss": 0.5778, "mean_token_accuracy": 0.8818523436784744, "num_tokens": 188305205.0, "step": 1761 }, { "epoch": 4.013683010262258, "grad_norm": 3.625, "learning_rate": 3.496152941656952e-06, "loss": 0.5695, "mean_token_accuracy": 0.8844558596611023, "num_tokens": 188411935.0, "step": 1762 }, { "epoch": 4.015963511972634, "grad_norm": 3.453125, "learning_rate": 3.494425187727528e-06, "loss": 0.5758, "mean_token_accuracy": 0.8814276456832886, "num_tokens": 188519090.0, "step": 1763 }, { "epoch": 4.01824401368301, "grad_norm": 3.609375, "learning_rate": 3.4926968693815667e-06, "loss": 0.5796, "mean_token_accuracy": 0.8852830976247787, "num_tokens": 188626017.0, "step": 1764 }, { "epoch": 4.020524515393387, "grad_norm": 2.984375, "learning_rate": 3.4909679876000256e-06, "loss": 0.5684, "mean_token_accuracy": 0.8862181156873703, "num_tokens": 188733308.0, "step": 1765 }, { "epoch": 4.022805017103763, "grad_norm": 4.4375, "learning_rate": 3.4892385433641875e-06, "loss": 0.5831, "mean_token_accuracy": 0.8816091120243073, "num_tokens": 188840326.0, "step": 1766 }, { "epoch": 4.025085518814139, "grad_norm": 2.921875, "learning_rate": 3.4875085376556493e-06, "loss": 0.5696, "mean_token_accuracy": 0.881555825471878, "num_tokens": 188947739.0, "step": 1767 }, { "epoch": 4.027366020524515, "grad_norm": 3.109375, "learning_rate": 3.4857779714563305e-06, "loss": 0.5838, "mean_token_accuracy": 0.8749964982271194, "num_tokens": 189054532.0, "step": 1768 }, { "epoch": 4.029646522234891, "grad_norm": 2.53125, "learning_rate": 3.4840468457484654e-06, "loss": 0.575, "mean_token_accuracy": 0.8818257004022598, "num_tokens": 189161911.0, "step": 1769 }, { "epoch": 4.031927023945268, "grad_norm": 2.515625, "learning_rate": 3.4823151615146093e-06, "loss": 0.5675, "mean_token_accuracy": 0.8840532153844833, "num_tokens": 189269010.0, "step": 1770 }, { "epoch": 4.034207525655645, "grad_norm": 3.484375, "learning_rate": 3.480582919737631e-06, "loss": 0.5791, "mean_token_accuracy": 0.8821255415678024, "num_tokens": 189375921.0, "step": 1771 }, { "epoch": 4.036488027366021, "grad_norm": 4.875, "learning_rate": 3.478850121400719e-06, "loss": 0.5811, "mean_token_accuracy": 0.8794375658035278, "num_tokens": 189483390.0, "step": 1772 }, { "epoch": 4.038768529076397, "grad_norm": 3.265625, "learning_rate": 3.477116767487375e-06, "loss": 0.5762, "mean_token_accuracy": 0.8811006546020508, "num_tokens": 189590557.0, "step": 1773 }, { "epoch": 4.041049030786773, "grad_norm": 2.734375, "learning_rate": 3.475382858981418e-06, "loss": 0.5838, "mean_token_accuracy": 0.8820205479860306, "num_tokens": 189697618.0, "step": 1774 }, { "epoch": 4.043329532497149, "grad_norm": 7.40625, "learning_rate": 3.473648396866981e-06, "loss": 0.5937, "mean_token_accuracy": 0.8759682923555374, "num_tokens": 189804472.0, "step": 1775 }, { "epoch": 4.045610034207526, "grad_norm": 4.21875, "learning_rate": 3.4719133821285108e-06, "loss": 0.5713, "mean_token_accuracy": 0.8793429881334305, "num_tokens": 189911753.0, "step": 1776 }, { "epoch": 4.047890535917902, "grad_norm": 4.78125, "learning_rate": 3.470177815750769e-06, "loss": 0.5698, "mean_token_accuracy": 0.8850299268960953, "num_tokens": 190019669.0, "step": 1777 }, { "epoch": 4.050171037628278, "grad_norm": 3.734375, "learning_rate": 3.4684416987188273e-06, "loss": 0.5716, "mean_token_accuracy": 0.8823755383491516, "num_tokens": 190127195.0, "step": 1778 }, { "epoch": 4.052451539338654, "grad_norm": 3.234375, "learning_rate": 3.4667050320180755e-06, "loss": 0.5832, "mean_token_accuracy": 0.8784758597612381, "num_tokens": 190234158.0, "step": 1779 }, { "epoch": 4.05473204104903, "grad_norm": 2.59375, "learning_rate": 3.4649678166342104e-06, "loss": 0.573, "mean_token_accuracy": 0.8827002048492432, "num_tokens": 190341000.0, "step": 1780 }, { "epoch": 4.0570125427594075, "grad_norm": 2.671875, "learning_rate": 3.4632300535532415e-06, "loss": 0.6008, "mean_token_accuracy": 0.8756804317235947, "num_tokens": 190448392.0, "step": 1781 }, { "epoch": 4.059293044469784, "grad_norm": 4.4375, "learning_rate": 3.46149174376149e-06, "loss": 0.5771, "mean_token_accuracy": 0.8806837797164917, "num_tokens": 190555060.0, "step": 1782 }, { "epoch": 4.06157354618016, "grad_norm": 5.09375, "learning_rate": 3.459752888245587e-06, "loss": 0.5831, "mean_token_accuracy": 0.8827811032533646, "num_tokens": 190662364.0, "step": 1783 }, { "epoch": 4.063854047890536, "grad_norm": 2.546875, "learning_rate": 3.4580134879924732e-06, "loss": 0.584, "mean_token_accuracy": 0.877520278096199, "num_tokens": 190769329.0, "step": 1784 }, { "epoch": 4.066134549600912, "grad_norm": 3.0, "learning_rate": 3.4562735439894e-06, "loss": 0.5815, "mean_token_accuracy": 0.88214111328125, "num_tokens": 190876871.0, "step": 1785 }, { "epoch": 4.068415051311288, "grad_norm": 3.890625, "learning_rate": 3.4545330572239234e-06, "loss": 0.6067, "mean_token_accuracy": 0.8759456723928452, "num_tokens": 190983692.0, "step": 1786 }, { "epoch": 4.070695553021665, "grad_norm": 4.65625, "learning_rate": 3.452792028683912e-06, "loss": 0.59, "mean_token_accuracy": 0.8788794428110123, "num_tokens": 191090686.0, "step": 1787 }, { "epoch": 4.072976054732041, "grad_norm": 2.71875, "learning_rate": 3.4510504593575396e-06, "loss": 0.5766, "mean_token_accuracy": 0.8841027617454529, "num_tokens": 191197679.0, "step": 1788 }, { "epoch": 4.075256556442417, "grad_norm": 3.96875, "learning_rate": 3.449308350233287e-06, "loss": 0.5768, "mean_token_accuracy": 0.8803805410861969, "num_tokens": 191305286.0, "step": 1789 }, { "epoch": 4.077537058152793, "grad_norm": 2.765625, "learning_rate": 3.447565702299942e-06, "loss": 0.5768, "mean_token_accuracy": 0.8774025738239288, "num_tokens": 191412336.0, "step": 1790 }, { "epoch": 4.07981755986317, "grad_norm": 2.859375, "learning_rate": 3.445822516546598e-06, "loss": 0.5745, "mean_token_accuracy": 0.881226509809494, "num_tokens": 191519740.0, "step": 1791 }, { "epoch": 4.0820980615735465, "grad_norm": 2.5625, "learning_rate": 3.444078793962653e-06, "loss": 0.5699, "mean_token_accuracy": 0.8817672431468964, "num_tokens": 191626597.0, "step": 1792 }, { "epoch": 4.084378563283923, "grad_norm": 2.328125, "learning_rate": 3.4423345355378114e-06, "loss": 0.5748, "mean_token_accuracy": 0.8822430968284607, "num_tokens": 191733873.0, "step": 1793 }, { "epoch": 4.086659064994299, "grad_norm": 4.625, "learning_rate": 3.440589742262079e-06, "loss": 0.5825, "mean_token_accuracy": 0.8803035467863083, "num_tokens": 191840789.0, "step": 1794 }, { "epoch": 4.088939566704675, "grad_norm": 5.25, "learning_rate": 3.438844415125768e-06, "loss": 0.5787, "mean_token_accuracy": 0.8835125714540482, "num_tokens": 191947897.0, "step": 1795 }, { "epoch": 4.091220068415051, "grad_norm": 2.4375, "learning_rate": 3.437098555119493e-06, "loss": 0.5752, "mean_token_accuracy": 0.8822659403085709, "num_tokens": 192055072.0, "step": 1796 }, { "epoch": 4.0935005701254275, "grad_norm": 2.890625, "learning_rate": 3.4353521632341686e-06, "loss": 0.5863, "mean_token_accuracy": 0.8791303038597107, "num_tokens": 192162000.0, "step": 1797 }, { "epoch": 4.095781071835804, "grad_norm": 7.75, "learning_rate": 3.4336052404610138e-06, "loss": 0.5796, "mean_token_accuracy": 0.8811719119548798, "num_tokens": 192269139.0, "step": 1798 }, { "epoch": 4.09806157354618, "grad_norm": 8.6875, "learning_rate": 3.431857787791549e-06, "loss": 0.5892, "mean_token_accuracy": 0.8792611956596375, "num_tokens": 192376137.0, "step": 1799 }, { "epoch": 4.100342075256556, "grad_norm": 6.96875, "learning_rate": 3.4301098062175936e-06, "loss": 0.5919, "mean_token_accuracy": 0.8786915689706802, "num_tokens": 192483573.0, "step": 1800 }, { "epoch": 4.102622576966933, "grad_norm": 3.4375, "learning_rate": 3.4283612967312692e-06, "loss": 0.6022, "mean_token_accuracy": 0.8749962151050568, "num_tokens": 192590098.0, "step": 1801 }, { "epoch": 4.104903078677309, "grad_norm": 3.21875, "learning_rate": 3.426612260324996e-06, "loss": 0.5693, "mean_token_accuracy": 0.8811412006616592, "num_tokens": 192696798.0, "step": 1802 }, { "epoch": 4.1071835803876855, "grad_norm": 4.84375, "learning_rate": 3.424862697991491e-06, "loss": 0.5495, "mean_token_accuracy": 0.8869053721427917, "num_tokens": 192804017.0, "step": 1803 }, { "epoch": 4.109464082098062, "grad_norm": 4.21875, "learning_rate": 3.4231126107237754e-06, "loss": 0.5735, "mean_token_accuracy": 0.8787698745727539, "num_tokens": 192912495.0, "step": 1804 }, { "epoch": 4.111744583808438, "grad_norm": 3.1875, "learning_rate": 3.4213619995151628e-06, "loss": 0.5761, "mean_token_accuracy": 0.8815167546272278, "num_tokens": 193019325.0, "step": 1805 }, { "epoch": 4.114025085518814, "grad_norm": 3.15625, "learning_rate": 3.4196108653592662e-06, "loss": 0.5571, "mean_token_accuracy": 0.8847940266132355, "num_tokens": 193126418.0, "step": 1806 }, { "epoch": 4.11630558722919, "grad_norm": 2.453125, "learning_rate": 3.417859209249997e-06, "loss": 0.5618, "mean_token_accuracy": 0.8841525167226791, "num_tokens": 193233113.0, "step": 1807 }, { "epoch": 4.1185860889395665, "grad_norm": 3.515625, "learning_rate": 3.4161070321815605e-06, "loss": 0.5985, "mean_token_accuracy": 0.8764221370220184, "num_tokens": 193339988.0, "step": 1808 }, { "epoch": 4.120866590649943, "grad_norm": 3.859375, "learning_rate": 3.4143543351484585e-06, "loss": 0.586, "mean_token_accuracy": 0.8799712508916855, "num_tokens": 193446945.0, "step": 1809 }, { "epoch": 4.123147092360319, "grad_norm": 3.8125, "learning_rate": 3.4126011191454877e-06, "loss": 0.5742, "mean_token_accuracy": 0.8829108774662018, "num_tokens": 193553550.0, "step": 1810 }, { "epoch": 4.125427594070696, "grad_norm": 5.15625, "learning_rate": 3.4108473851677408e-06, "loss": 0.5868, "mean_token_accuracy": 0.8795969039201736, "num_tokens": 193660308.0, "step": 1811 }, { "epoch": 4.127708095781072, "grad_norm": 3.375, "learning_rate": 3.4090931342106024e-06, "loss": 0.5837, "mean_token_accuracy": 0.8809874951839447, "num_tokens": 193768000.0, "step": 1812 }, { "epoch": 4.129988597491448, "grad_norm": 3.078125, "learning_rate": 3.4073383672697524e-06, "loss": 0.5897, "mean_token_accuracy": 0.8771644234657288, "num_tokens": 193875084.0, "step": 1813 }, { "epoch": 4.1322690992018245, "grad_norm": 4.8125, "learning_rate": 3.4055830853411616e-06, "loss": 0.6109, "mean_token_accuracy": 0.8745006024837494, "num_tokens": 193981792.0, "step": 1814 }, { "epoch": 4.134549600912201, "grad_norm": 2.734375, "learning_rate": 3.4038272894210945e-06, "loss": 0.5889, "mean_token_accuracy": 0.8784460872411728, "num_tokens": 194088451.0, "step": 1815 }, { "epoch": 4.136830102622577, "grad_norm": 3.21875, "learning_rate": 3.4020709805061066e-06, "loss": 0.574, "mean_token_accuracy": 0.8836447149515152, "num_tokens": 194195882.0, "step": 1816 }, { "epoch": 4.139110604332953, "grad_norm": 2.703125, "learning_rate": 3.4003141595930456e-06, "loss": 0.5758, "mean_token_accuracy": 0.8817628026008606, "num_tokens": 194302902.0, "step": 1817 }, { "epoch": 4.141391106043329, "grad_norm": 2.78125, "learning_rate": 3.3985568276790487e-06, "loss": 0.5747, "mean_token_accuracy": 0.8798153549432755, "num_tokens": 194410857.0, "step": 1818 }, { "epoch": 4.1436716077537055, "grad_norm": 3.734375, "learning_rate": 3.3967989857615434e-06, "loss": 0.589, "mean_token_accuracy": 0.8804911673069, "num_tokens": 194517991.0, "step": 1819 }, { "epoch": 4.145952109464082, "grad_norm": 4.09375, "learning_rate": 3.3950406348382483e-06, "loss": 0.5691, "mean_token_accuracy": 0.8826321363449097, "num_tokens": 194624664.0, "step": 1820 }, { "epoch": 4.148232611174459, "grad_norm": 4.375, "learning_rate": 3.3932817759071666e-06, "loss": 0.5761, "mean_token_accuracy": 0.8814938217401505, "num_tokens": 194731631.0, "step": 1821 }, { "epoch": 4.150513112884835, "grad_norm": 2.65625, "learning_rate": 3.3915224099665962e-06, "loss": 0.5791, "mean_token_accuracy": 0.8795495629310608, "num_tokens": 194838909.0, "step": 1822 }, { "epoch": 4.152793614595211, "grad_norm": 4.15625, "learning_rate": 3.389762538015116e-06, "loss": 0.5996, "mean_token_accuracy": 0.8770654946565628, "num_tokens": 194945773.0, "step": 1823 }, { "epoch": 4.155074116305587, "grad_norm": 2.65625, "learning_rate": 3.388002161051598e-06, "loss": 0.5975, "mean_token_accuracy": 0.8804232776165009, "num_tokens": 195052653.0, "step": 1824 }, { "epoch": 4.1573546180159635, "grad_norm": 4.625, "learning_rate": 3.3862412800751963e-06, "loss": 0.595, "mean_token_accuracy": 0.8783848881721497, "num_tokens": 195159206.0, "step": 1825 }, { "epoch": 4.15963511972634, "grad_norm": 5.15625, "learning_rate": 3.3844798960853533e-06, "loss": 0.5826, "mean_token_accuracy": 0.879721149802208, "num_tokens": 195266220.0, "step": 1826 }, { "epoch": 4.161915621436716, "grad_norm": 3.734375, "learning_rate": 3.382718010081797e-06, "loss": 0.5853, "mean_token_accuracy": 0.8808034509420395, "num_tokens": 195373432.0, "step": 1827 }, { "epoch": 4.164196123147092, "grad_norm": 3.40625, "learning_rate": 3.38095562306454e-06, "loss": 0.5872, "mean_token_accuracy": 0.8810047507286072, "num_tokens": 195480280.0, "step": 1828 }, { "epoch": 4.166476624857468, "grad_norm": 2.28125, "learning_rate": 3.3791927360338785e-06, "loss": 0.5697, "mean_token_accuracy": 0.8826696872711182, "num_tokens": 195587648.0, "step": 1829 }, { "epoch": 4.168757126567845, "grad_norm": 3.203125, "learning_rate": 3.3774293499903934e-06, "loss": 0.5888, "mean_token_accuracy": 0.8804333359003067, "num_tokens": 195694201.0, "step": 1830 }, { "epoch": 4.1710376282782216, "grad_norm": 4.53125, "learning_rate": 3.3756654659349487e-06, "loss": 0.5804, "mean_token_accuracy": 0.879314973950386, "num_tokens": 195800840.0, "step": 1831 }, { "epoch": 4.173318129988598, "grad_norm": 3.859375, "learning_rate": 3.373901084868691e-06, "loss": 0.5692, "mean_token_accuracy": 0.8824407607316971, "num_tokens": 195908050.0, "step": 1832 }, { "epoch": 4.175598631698974, "grad_norm": 4.4375, "learning_rate": 3.372136207793049e-06, "loss": 0.5689, "mean_token_accuracy": 0.8842916339635849, "num_tokens": 196014977.0, "step": 1833 }, { "epoch": 4.17787913340935, "grad_norm": 2.671875, "learning_rate": 3.3703708357097333e-06, "loss": 0.563, "mean_token_accuracy": 0.8835365027189255, "num_tokens": 196122033.0, "step": 1834 }, { "epoch": 4.180159635119726, "grad_norm": 5.25, "learning_rate": 3.3686049696207336e-06, "loss": 0.5711, "mean_token_accuracy": 0.8810452073812485, "num_tokens": 196228948.0, "step": 1835 }, { "epoch": 4.1824401368301025, "grad_norm": 4.90625, "learning_rate": 3.3668386105283226e-06, "loss": 0.5784, "mean_token_accuracy": 0.882371187210083, "num_tokens": 196335775.0, "step": 1836 }, { "epoch": 4.184720638540479, "grad_norm": 4.03125, "learning_rate": 3.365071759435051e-06, "loss": 0.574, "mean_token_accuracy": 0.8814999759197235, "num_tokens": 196442967.0, "step": 1837 }, { "epoch": 4.187001140250855, "grad_norm": 2.828125, "learning_rate": 3.363304417343749e-06, "loss": 0.5788, "mean_token_accuracy": 0.8800509870052338, "num_tokens": 196550345.0, "step": 1838 }, { "epoch": 4.189281641961231, "grad_norm": 2.625, "learning_rate": 3.3615365852575276e-06, "loss": 0.5749, "mean_token_accuracy": 0.8801786154508591, "num_tokens": 196658198.0, "step": 1839 }, { "epoch": 4.191562143671608, "grad_norm": 5.03125, "learning_rate": 3.359768264179772e-06, "loss": 0.5852, "mean_token_accuracy": 0.8778839707374573, "num_tokens": 196765684.0, "step": 1840 }, { "epoch": 4.193842645381984, "grad_norm": 2.921875, "learning_rate": 3.357999455114148e-06, "loss": 0.5758, "mean_token_accuracy": 0.8802204430103302, "num_tokens": 196872724.0, "step": 1841 }, { "epoch": 4.196123147092361, "grad_norm": 2.421875, "learning_rate": 3.356230159064599e-06, "loss": 0.5644, "mean_token_accuracy": 0.8841566443443298, "num_tokens": 196979553.0, "step": 1842 }, { "epoch": 4.198403648802737, "grad_norm": 2.59375, "learning_rate": 3.3544603770353407e-06, "loss": 0.5583, "mean_token_accuracy": 0.8825344890356064, "num_tokens": 197086673.0, "step": 1843 }, { "epoch": 4.200684150513113, "grad_norm": 2.921875, "learning_rate": 3.352690110030869e-06, "loss": 0.5991, "mean_token_accuracy": 0.8824966847896576, "num_tokens": 197193889.0, "step": 1844 }, { "epoch": 4.202964652223489, "grad_norm": 2.96875, "learning_rate": 3.350919359055953e-06, "loss": 0.5828, "mean_token_accuracy": 0.8796708136796951, "num_tokens": 197301220.0, "step": 1845 }, { "epoch": 4.205245153933865, "grad_norm": 3.0, "learning_rate": 3.3491481251156355e-06, "loss": 0.5727, "mean_token_accuracy": 0.8849449008703232, "num_tokens": 197408502.0, "step": 1846 }, { "epoch": 4.2075256556442415, "grad_norm": 2.71875, "learning_rate": 3.347376409215236e-06, "loss": 0.5752, "mean_token_accuracy": 0.8793286383152008, "num_tokens": 197515859.0, "step": 1847 }, { "epoch": 4.209806157354618, "grad_norm": 2.546875, "learning_rate": 3.345604212360346e-06, "loss": 0.588, "mean_token_accuracy": 0.8793947696685791, "num_tokens": 197622609.0, "step": 1848 }, { "epoch": 4.212086659064994, "grad_norm": 4.5, "learning_rate": 3.3438315355568295e-06, "loss": 0.5806, "mean_token_accuracy": 0.8782702833414078, "num_tokens": 197729199.0, "step": 1849 }, { "epoch": 4.214367160775371, "grad_norm": 4.71875, "learning_rate": 3.3420583798108253e-06, "loss": 0.578, "mean_token_accuracy": 0.8802850246429443, "num_tokens": 197835966.0, "step": 1850 }, { "epoch": 4.216647662485747, "grad_norm": 3.84375, "learning_rate": 3.34028474612874e-06, "loss": 0.5743, "mean_token_accuracy": 0.8822648972272873, "num_tokens": 197942829.0, "step": 1851 }, { "epoch": 4.218928164196123, "grad_norm": 5.375, "learning_rate": 3.338510635517256e-06, "loss": 0.6037, "mean_token_accuracy": 0.8766555935144424, "num_tokens": 198050288.0, "step": 1852 }, { "epoch": 4.2212086659065, "grad_norm": 6.46875, "learning_rate": 3.3367360489833236e-06, "loss": 0.573, "mean_token_accuracy": 0.8814980983734131, "num_tokens": 198158005.0, "step": 1853 }, { "epoch": 4.223489167616876, "grad_norm": 8.0625, "learning_rate": 3.3349609875341626e-06, "loss": 0.5662, "mean_token_accuracy": 0.882731556892395, "num_tokens": 198265125.0, "step": 1854 }, { "epoch": 4.225769669327252, "grad_norm": 3.390625, "learning_rate": 3.3331854521772656e-06, "loss": 0.558, "mean_token_accuracy": 0.8883755952119827, "num_tokens": 198371867.0, "step": 1855 }, { "epoch": 4.228050171037628, "grad_norm": 2.375, "learning_rate": 3.3314094439203903e-06, "loss": 0.5772, "mean_token_accuracy": 0.8845670074224472, "num_tokens": 198478602.0, "step": 1856 }, { "epoch": 4.230330672748004, "grad_norm": 4.1875, "learning_rate": 3.3296329637715662e-06, "loss": 0.5775, "mean_token_accuracy": 0.8805161267518997, "num_tokens": 198585016.0, "step": 1857 }, { "epoch": 4.2326111744583805, "grad_norm": 3.46875, "learning_rate": 3.3278560127390892e-06, "loss": 0.5769, "mean_token_accuracy": 0.8802484422922134, "num_tokens": 198692273.0, "step": 1858 }, { "epoch": 4.234891676168757, "grad_norm": 4.6875, "learning_rate": 3.32607859183152e-06, "loss": 0.581, "mean_token_accuracy": 0.8806602954864502, "num_tokens": 198798563.0, "step": 1859 }, { "epoch": 4.237172177879134, "grad_norm": 2.34375, "learning_rate": 3.3243007020576917e-06, "loss": 0.5911, "mean_token_accuracy": 0.8770420551300049, "num_tokens": 198905412.0, "step": 1860 }, { "epoch": 4.23945267958951, "grad_norm": 6.125, "learning_rate": 3.322522344426698e-06, "loss": 0.5751, "mean_token_accuracy": 0.883165031671524, "num_tokens": 199012136.0, "step": 1861 }, { "epoch": 4.241733181299886, "grad_norm": 2.734375, "learning_rate": 3.320743519947901e-06, "loss": 0.5804, "mean_token_accuracy": 0.880427822470665, "num_tokens": 199119481.0, "step": 1862 }, { "epoch": 4.244013683010262, "grad_norm": 6.875, "learning_rate": 3.318964229630927e-06, "loss": 0.5902, "mean_token_accuracy": 0.8777157664299011, "num_tokens": 199226442.0, "step": 1863 }, { "epoch": 4.246294184720639, "grad_norm": 3.453125, "learning_rate": 3.3171844744856675e-06, "loss": 0.569, "mean_token_accuracy": 0.8849682509899139, "num_tokens": 199333370.0, "step": 1864 }, { "epoch": 4.248574686431015, "grad_norm": 3.953125, "learning_rate": 3.3154042555222758e-06, "loss": 0.5787, "mean_token_accuracy": 0.8795311897993088, "num_tokens": 199440819.0, "step": 1865 }, { "epoch": 4.250855188141391, "grad_norm": 4.8125, "learning_rate": 3.3136235737511715e-06, "loss": 0.589, "mean_token_accuracy": 0.8744902908802032, "num_tokens": 199547464.0, "step": 1866 }, { "epoch": 4.253135689851767, "grad_norm": 2.90625, "learning_rate": 3.3118424301830343e-06, "loss": 0.5884, "mean_token_accuracy": 0.8773903250694275, "num_tokens": 199654391.0, "step": 1867 }, { "epoch": 4.255416191562143, "grad_norm": 3.265625, "learning_rate": 3.310060825828807e-06, "loss": 0.588, "mean_token_accuracy": 0.8800706118345261, "num_tokens": 199761255.0, "step": 1868 }, { "epoch": 4.2576966932725195, "grad_norm": 2.828125, "learning_rate": 3.3082787616996938e-06, "loss": 0.5681, "mean_token_accuracy": 0.8814914971590042, "num_tokens": 199868966.0, "step": 1869 }, { "epoch": 4.259977194982897, "grad_norm": 2.453125, "learning_rate": 3.3064962388071586e-06, "loss": 0.5867, "mean_token_accuracy": 0.8803159892559052, "num_tokens": 199975974.0, "step": 1870 }, { "epoch": 4.262257696693273, "grad_norm": 2.671875, "learning_rate": 3.3047132581629297e-06, "loss": 0.5617, "mean_token_accuracy": 0.8839969784021378, "num_tokens": 200083683.0, "step": 1871 }, { "epoch": 4.264538198403649, "grad_norm": 2.765625, "learning_rate": 3.3029298207789907e-06, "loss": 0.5708, "mean_token_accuracy": 0.8819585740566254, "num_tokens": 200191486.0, "step": 1872 }, { "epoch": 4.266818700114025, "grad_norm": 6.03125, "learning_rate": 3.301145927667586e-06, "loss": 0.5876, "mean_token_accuracy": 0.8768231570720673, "num_tokens": 200298545.0, "step": 1873 }, { "epoch": 4.269099201824401, "grad_norm": 2.6875, "learning_rate": 3.2993615798412204e-06, "loss": 0.584, "mean_token_accuracy": 0.8820153176784515, "num_tokens": 200405570.0, "step": 1874 }, { "epoch": 4.271379703534778, "grad_norm": 4.9375, "learning_rate": 3.297576778312654e-06, "loss": 0.5716, "mean_token_accuracy": 0.8819970637559891, "num_tokens": 200512849.0, "step": 1875 }, { "epoch": 4.273660205245154, "grad_norm": 3.578125, "learning_rate": 3.295791524094906e-06, "loss": 0.5795, "mean_token_accuracy": 0.878609761595726, "num_tokens": 200620132.0, "step": 1876 }, { "epoch": 4.27594070695553, "grad_norm": 4.8125, "learning_rate": 3.294005818201252e-06, "loss": 0.5811, "mean_token_accuracy": 0.8842368423938751, "num_tokens": 200726719.0, "step": 1877 }, { "epoch": 4.278221208665906, "grad_norm": 2.5625, "learning_rate": 3.2922196616452253e-06, "loss": 0.5841, "mean_token_accuracy": 0.8771747201681137, "num_tokens": 200833547.0, "step": 1878 }, { "epoch": 4.280501710376283, "grad_norm": 2.515625, "learning_rate": 3.2904330554406126e-06, "loss": 0.5726, "mean_token_accuracy": 0.880841001868248, "num_tokens": 200941003.0, "step": 1879 }, { "epoch": 4.282782212086659, "grad_norm": 2.71875, "learning_rate": 3.288646000601457e-06, "loss": 0.5993, "mean_token_accuracy": 0.8747004866600037, "num_tokens": 201048114.0, "step": 1880 }, { "epoch": 4.285062713797036, "grad_norm": 5.0625, "learning_rate": 3.286858498142057e-06, "loss": 0.5821, "mean_token_accuracy": 0.8779664039611816, "num_tokens": 201155149.0, "step": 1881 }, { "epoch": 4.287343215507412, "grad_norm": 3.65625, "learning_rate": 3.285070549076965e-06, "loss": 0.5883, "mean_token_accuracy": 0.877086952328682, "num_tokens": 201262291.0, "step": 1882 }, { "epoch": 4.289623717217788, "grad_norm": 3.5625, "learning_rate": 3.283282154420985e-06, "loss": 0.5723, "mean_token_accuracy": 0.8825719207525253, "num_tokens": 201369699.0, "step": 1883 }, { "epoch": 4.291904218928164, "grad_norm": 4.0625, "learning_rate": 3.2814933151891766e-06, "loss": 0.5674, "mean_token_accuracy": 0.881410762667656, "num_tokens": 201476718.0, "step": 1884 }, { "epoch": 4.29418472063854, "grad_norm": 2.859375, "learning_rate": 3.2797040323968493e-06, "loss": 0.5776, "mean_token_accuracy": 0.8789144903421402, "num_tokens": 201583789.0, "step": 1885 }, { "epoch": 4.296465222348917, "grad_norm": 3.515625, "learning_rate": 3.277914307059566e-06, "loss": 0.5567, "mean_token_accuracy": 0.886613667011261, "num_tokens": 201690953.0, "step": 1886 }, { "epoch": 4.298745724059293, "grad_norm": 3.28125, "learning_rate": 3.276124140193141e-06, "loss": 0.5685, "mean_token_accuracy": 0.882378563284874, "num_tokens": 201798723.0, "step": 1887 }, { "epoch": 4.301026225769669, "grad_norm": 2.71875, "learning_rate": 3.274333532813637e-06, "loss": 0.6009, "mean_token_accuracy": 0.8767693191766739, "num_tokens": 201905583.0, "step": 1888 }, { "epoch": 4.303306727480045, "grad_norm": 3.921875, "learning_rate": 3.272542485937369e-06, "loss": 0.5607, "mean_token_accuracy": 0.89011649787426, "num_tokens": 202012658.0, "step": 1889 }, { "epoch": 4.305587229190422, "grad_norm": 2.5, "learning_rate": 3.2707510005809005e-06, "loss": 0.5743, "mean_token_accuracy": 0.8810762017965317, "num_tokens": 202120313.0, "step": 1890 }, { "epoch": 4.307867730900798, "grad_norm": 3.1875, "learning_rate": 3.2689590777610443e-06, "loss": 0.5833, "mean_token_accuracy": 0.8805640786886215, "num_tokens": 202227525.0, "step": 1891 }, { "epoch": 4.310148232611175, "grad_norm": 6.125, "learning_rate": 3.267166718494861e-06, "loss": 0.5875, "mean_token_accuracy": 0.8817348927259445, "num_tokens": 202334247.0, "step": 1892 }, { "epoch": 4.312428734321551, "grad_norm": 4.03125, "learning_rate": 3.265373923799658e-06, "loss": 0.5604, "mean_token_accuracy": 0.8808832913637161, "num_tokens": 202441565.0, "step": 1893 }, { "epoch": 4.314709236031927, "grad_norm": 2.71875, "learning_rate": 3.263580694692992e-06, "loss": 0.5671, "mean_token_accuracy": 0.882627323269844, "num_tokens": 202549008.0, "step": 1894 }, { "epoch": 4.316989737742303, "grad_norm": 3.59375, "learning_rate": 3.261787032192666e-06, "loss": 0.5842, "mean_token_accuracy": 0.8810908943414688, "num_tokens": 202655622.0, "step": 1895 }, { "epoch": 4.319270239452679, "grad_norm": 4.625, "learning_rate": 3.259992937316727e-06, "loss": 0.5903, "mean_token_accuracy": 0.882026731967926, "num_tokens": 202761692.0, "step": 1896 }, { "epoch": 4.321550741163056, "grad_norm": 5.1875, "learning_rate": 3.258198411083469e-06, "loss": 0.5827, "mean_token_accuracy": 0.8775666505098343, "num_tokens": 202868233.0, "step": 1897 }, { "epoch": 4.323831242873432, "grad_norm": 8.4375, "learning_rate": 3.2564034545114308e-06, "loss": 0.5866, "mean_token_accuracy": 0.8776109367609024, "num_tokens": 202975704.0, "step": 1898 }, { "epoch": 4.326111744583809, "grad_norm": 2.59375, "learning_rate": 3.2546080686193947e-06, "loss": 0.5689, "mean_token_accuracy": 0.8826450109481812, "num_tokens": 203082943.0, "step": 1899 }, { "epoch": 4.328392246294185, "grad_norm": 2.703125, "learning_rate": 3.2528122544263873e-06, "loss": 0.5614, "mean_token_accuracy": 0.8836265951395035, "num_tokens": 203189879.0, "step": 1900 }, { "epoch": 4.330672748004561, "grad_norm": 3.515625, "learning_rate": 3.251016012951678e-06, "loss": 0.5576, "mean_token_accuracy": 0.8853756338357925, "num_tokens": 203296924.0, "step": 1901 }, { "epoch": 4.3329532497149374, "grad_norm": 8.4375, "learning_rate": 3.2492193452147774e-06, "loss": 0.5959, "mean_token_accuracy": 0.8769858479499817, "num_tokens": 203403322.0, "step": 1902 }, { "epoch": 4.335233751425314, "grad_norm": 5.5, "learning_rate": 3.247422252235442e-06, "loss": 0.5485, "mean_token_accuracy": 0.8848748654127121, "num_tokens": 203510621.0, "step": 1903 }, { "epoch": 4.33751425313569, "grad_norm": 2.421875, "learning_rate": 3.245624735033665e-06, "loss": 0.563, "mean_token_accuracy": 0.8844299167394638, "num_tokens": 203617576.0, "step": 1904 }, { "epoch": 4.339794754846066, "grad_norm": 4.65625, "learning_rate": 3.2438267946296836e-06, "loss": 0.5739, "mean_token_accuracy": 0.8842675089836121, "num_tokens": 203724726.0, "step": 1905 }, { "epoch": 4.342075256556442, "grad_norm": 3.109375, "learning_rate": 3.242028432043974e-06, "loss": 0.5863, "mean_token_accuracy": 0.8774498254060745, "num_tokens": 203832260.0, "step": 1906 }, { "epoch": 4.344355758266818, "grad_norm": 2.953125, "learning_rate": 3.2402296482972513e-06, "loss": 0.5718, "mean_token_accuracy": 0.8836204558610916, "num_tokens": 203940076.0, "step": 1907 }, { "epoch": 4.346636259977195, "grad_norm": 7.75, "learning_rate": 3.238430444410471e-06, "loss": 0.5722, "mean_token_accuracy": 0.8820386677980423, "num_tokens": 204047654.0, "step": 1908 }, { "epoch": 4.348916761687571, "grad_norm": 5.09375, "learning_rate": 3.2366308214048262e-06, "loss": 0.5827, "mean_token_accuracy": 0.8806447833776474, "num_tokens": 204154623.0, "step": 1909 }, { "epoch": 4.351197263397948, "grad_norm": 5.03125, "learning_rate": 3.2348307803017493e-06, "loss": 0.5682, "mean_token_accuracy": 0.886213019490242, "num_tokens": 204261536.0, "step": 1910 }, { "epoch": 4.353477765108324, "grad_norm": 4.6875, "learning_rate": 3.2330303221229078e-06, "loss": 0.5915, "mean_token_accuracy": 0.8766489624977112, "num_tokens": 204368307.0, "step": 1911 }, { "epoch": 4.3557582668187, "grad_norm": 2.6875, "learning_rate": 3.231229447890206e-06, "loss": 0.5718, "mean_token_accuracy": 0.8825319856405258, "num_tokens": 204475273.0, "step": 1912 }, { "epoch": 4.3580387685290765, "grad_norm": 4.21875, "learning_rate": 3.229428158625787e-06, "loss": 0.5708, "mean_token_accuracy": 0.8858949691057205, "num_tokens": 204582592.0, "step": 1913 }, { "epoch": 4.360319270239453, "grad_norm": 2.953125, "learning_rate": 3.2276264553520275e-06, "loss": 0.5698, "mean_token_accuracy": 0.8835407346487045, "num_tokens": 204689248.0, "step": 1914 }, { "epoch": 4.362599771949829, "grad_norm": 3.09375, "learning_rate": 3.2258243390915397e-06, "loss": 0.5872, "mean_token_accuracy": 0.8794312477111816, "num_tokens": 204796588.0, "step": 1915 }, { "epoch": 4.364880273660205, "grad_norm": 3.546875, "learning_rate": 3.2240218108671683e-06, "loss": 0.5708, "mean_token_accuracy": 0.8816234618425369, "num_tokens": 204903575.0, "step": 1916 }, { "epoch": 4.367160775370581, "grad_norm": 4.625, "learning_rate": 3.2222188717019965e-06, "loss": 0.5834, "mean_token_accuracy": 0.8821093142032623, "num_tokens": 205010359.0, "step": 1917 }, { "epoch": 4.369441277080957, "grad_norm": 3.53125, "learning_rate": 3.220415522619335e-06, "loss": 0.5984, "mean_token_accuracy": 0.8760864734649658, "num_tokens": 205117682.0, "step": 1918 }, { "epoch": 4.3717217787913345, "grad_norm": 2.34375, "learning_rate": 3.218611764642732e-06, "loss": 0.5852, "mean_token_accuracy": 0.8813271969556808, "num_tokens": 205224758.0, "step": 1919 }, { "epoch": 4.374002280501711, "grad_norm": 3.375, "learning_rate": 3.2168075987959633e-06, "loss": 0.6073, "mean_token_accuracy": 0.8754995763301849, "num_tokens": 205331474.0, "step": 1920 }, { "epoch": 4.376282782212087, "grad_norm": 4.0, "learning_rate": 3.2150030261030414e-06, "loss": 0.5779, "mean_token_accuracy": 0.8807307332754135, "num_tokens": 205438379.0, "step": 1921 }, { "epoch": 4.378563283922463, "grad_norm": 2.921875, "learning_rate": 3.2131980475882053e-06, "loss": 0.5798, "mean_token_accuracy": 0.8842781186103821, "num_tokens": 205545207.0, "step": 1922 }, { "epoch": 4.380843785632839, "grad_norm": 3.328125, "learning_rate": 3.2113926642759256e-06, "loss": 0.605, "mean_token_accuracy": 0.8773124665021896, "num_tokens": 205651526.0, "step": 1923 }, { "epoch": 4.3831242873432155, "grad_norm": 4.15625, "learning_rate": 3.2095868771909037e-06, "loss": 0.5745, "mean_token_accuracy": 0.8833547234535217, "num_tokens": 205758217.0, "step": 1924 }, { "epoch": 4.385404789053592, "grad_norm": 2.9375, "learning_rate": 3.2077806873580696e-06, "loss": 0.5551, "mean_token_accuracy": 0.8848164230585098, "num_tokens": 205865218.0, "step": 1925 }, { "epoch": 4.387685290763968, "grad_norm": 5.25, "learning_rate": 3.205974095802582e-06, "loss": 0.5773, "mean_token_accuracy": 0.8819639384746552, "num_tokens": 205971573.0, "step": 1926 }, { "epoch": 4.389965792474344, "grad_norm": 6.0, "learning_rate": 3.204167103549827e-06, "loss": 0.5684, "mean_token_accuracy": 0.8817443251609802, "num_tokens": 206078630.0, "step": 1927 }, { "epoch": 4.39224629418472, "grad_norm": 4.15625, "learning_rate": 3.2023597116254175e-06, "loss": 0.5762, "mean_token_accuracy": 0.8799740672111511, "num_tokens": 206185346.0, "step": 1928 }, { "epoch": 4.394526795895097, "grad_norm": 2.625, "learning_rate": 3.2005519210551955e-06, "loss": 0.5625, "mean_token_accuracy": 0.885019987821579, "num_tokens": 206292483.0, "step": 1929 }, { "epoch": 4.3968072976054735, "grad_norm": 2.5625, "learning_rate": 3.1987437328652287e-06, "loss": 0.5626, "mean_token_accuracy": 0.8824569880962372, "num_tokens": 206399488.0, "step": 1930 }, { "epoch": 4.39908779931585, "grad_norm": 2.671875, "learning_rate": 3.196935148081808e-06, "loss": 0.5953, "mean_token_accuracy": 0.8730659335851669, "num_tokens": 206506633.0, "step": 1931 }, { "epoch": 4.401368301026226, "grad_norm": 3.140625, "learning_rate": 3.1951261677314526e-06, "loss": 0.5769, "mean_token_accuracy": 0.8806923627853394, "num_tokens": 206614034.0, "step": 1932 }, { "epoch": 4.403648802736602, "grad_norm": 2.546875, "learning_rate": 3.1933167928409046e-06, "loss": 0.5731, "mean_token_accuracy": 0.882137656211853, "num_tokens": 206721766.0, "step": 1933 }, { "epoch": 4.405929304446978, "grad_norm": 2.46875, "learning_rate": 3.1915070244371295e-06, "loss": 0.5581, "mean_token_accuracy": 0.8834485858678818, "num_tokens": 206829226.0, "step": 1934 }, { "epoch": 4.4082098061573545, "grad_norm": 3.078125, "learning_rate": 3.1896968635473174e-06, "loss": 0.5758, "mean_token_accuracy": 0.8820080310106277, "num_tokens": 206936587.0, "step": 1935 }, { "epoch": 4.410490307867731, "grad_norm": 2.4375, "learning_rate": 3.187886311198881e-06, "loss": 0.5638, "mean_token_accuracy": 0.8846455514431, "num_tokens": 207043902.0, "step": 1936 }, { "epoch": 4.412770809578107, "grad_norm": 4.75, "learning_rate": 3.1860753684194536e-06, "loss": 0.5702, "mean_token_accuracy": 0.8813911080360413, "num_tokens": 207151288.0, "step": 1937 }, { "epoch": 4.415051311288483, "grad_norm": 3.78125, "learning_rate": 3.1842640362368932e-06, "loss": 0.5791, "mean_token_accuracy": 0.878994882106781, "num_tokens": 207258999.0, "step": 1938 }, { "epoch": 4.41733181299886, "grad_norm": 3.046875, "learning_rate": 3.182452315679276e-06, "loss": 0.545, "mean_token_accuracy": 0.8886394798755646, "num_tokens": 207366568.0, "step": 1939 }, { "epoch": 4.419612314709236, "grad_norm": 2.4375, "learning_rate": 3.1806402077748987e-06, "loss": 0.5678, "mean_token_accuracy": 0.8816164880990982, "num_tokens": 207473843.0, "step": 1940 }, { "epoch": 4.4218928164196125, "grad_norm": 2.84375, "learning_rate": 3.178827713552281e-06, "loss": 0.5808, "mean_token_accuracy": 0.8799097836017609, "num_tokens": 207580995.0, "step": 1941 }, { "epoch": 4.424173318129989, "grad_norm": 2.984375, "learning_rate": 3.177014834040158e-06, "loss": 0.5652, "mean_token_accuracy": 0.883696660399437, "num_tokens": 207688161.0, "step": 1942 }, { "epoch": 4.426453819840365, "grad_norm": 2.5625, "learning_rate": 3.1752015702674855e-06, "loss": 0.6015, "mean_token_accuracy": 0.8793037235736847, "num_tokens": 207795164.0, "step": 1943 }, { "epoch": 4.428734321550741, "grad_norm": 2.5, "learning_rate": 3.173387923263437e-06, "loss": 0.5574, "mean_token_accuracy": 0.8862407803535461, "num_tokens": 207902155.0, "step": 1944 }, { "epoch": 4.431014823261117, "grad_norm": 3.453125, "learning_rate": 3.1715738940574032e-06, "loss": 0.5604, "mean_token_accuracy": 0.8828154355287552, "num_tokens": 208009297.0, "step": 1945 }, { "epoch": 4.4332953249714935, "grad_norm": 4.0625, "learning_rate": 3.1697594836789924e-06, "loss": 0.5774, "mean_token_accuracy": 0.8778323084115982, "num_tokens": 208116608.0, "step": 1946 }, { "epoch": 4.43557582668187, "grad_norm": 2.59375, "learning_rate": 3.167944693158029e-06, "loss": 0.5788, "mean_token_accuracy": 0.8808430731296539, "num_tokens": 208223390.0, "step": 1947 }, { "epoch": 4.437856328392247, "grad_norm": 3.578125, "learning_rate": 3.166129523524553e-06, "loss": 0.6007, "mean_token_accuracy": 0.8762641549110413, "num_tokens": 208330138.0, "step": 1948 }, { "epoch": 4.440136830102623, "grad_norm": 2.890625, "learning_rate": 3.1643139758088194e-06, "loss": 0.5849, "mean_token_accuracy": 0.8787393867969513, "num_tokens": 208437552.0, "step": 1949 }, { "epoch": 4.442417331812999, "grad_norm": 4.28125, "learning_rate": 3.1624980510412984e-06, "loss": 0.5987, "mean_token_accuracy": 0.8784570544958115, "num_tokens": 208544652.0, "step": 1950 }, { "epoch": 4.444697833523375, "grad_norm": 3.484375, "learning_rate": 3.160681750252674e-06, "loss": 0.573, "mean_token_accuracy": 0.8795022666454315, "num_tokens": 208651498.0, "step": 1951 }, { "epoch": 4.4469783352337515, "grad_norm": 3.78125, "learning_rate": 3.1588650744738418e-06, "loss": 0.5617, "mean_token_accuracy": 0.8832292854785919, "num_tokens": 208758479.0, "step": 1952 }, { "epoch": 4.449258836944128, "grad_norm": 2.859375, "learning_rate": 3.1570480247359147e-06, "loss": 0.5788, "mean_token_accuracy": 0.8820718824863434, "num_tokens": 208865610.0, "step": 1953 }, { "epoch": 4.451539338654504, "grad_norm": 2.984375, "learning_rate": 3.155230602070213e-06, "loss": 0.5795, "mean_token_accuracy": 0.879115030169487, "num_tokens": 208972259.0, "step": 1954 }, { "epoch": 4.45381984036488, "grad_norm": 3.09375, "learning_rate": 3.153412807508271e-06, "loss": 0.5855, "mean_token_accuracy": 0.8792910128831863, "num_tokens": 209078252.0, "step": 1955 }, { "epoch": 4.456100342075256, "grad_norm": 2.96875, "learning_rate": 3.1515946420818343e-06, "loss": 0.5557, "mean_token_accuracy": 0.8846887648105621, "num_tokens": 209185333.0, "step": 1956 }, { "epoch": 4.4583808437856325, "grad_norm": 2.65625, "learning_rate": 3.1497761068228585e-06, "loss": 0.5741, "mean_token_accuracy": 0.8841688334941864, "num_tokens": 209292465.0, "step": 1957 }, { "epoch": 4.460661345496009, "grad_norm": 3.5, "learning_rate": 3.1479572027635085e-06, "loss": 0.5716, "mean_token_accuracy": 0.8820602297782898, "num_tokens": 209399257.0, "step": 1958 }, { "epoch": 4.462941847206386, "grad_norm": 4.15625, "learning_rate": 3.1461379309361594e-06, "loss": 0.5927, "mean_token_accuracy": 0.8786464035511017, "num_tokens": 209506400.0, "step": 1959 }, { "epoch": 4.465222348916762, "grad_norm": 2.671875, "learning_rate": 3.144318292373395e-06, "loss": 0.5487, "mean_token_accuracy": 0.8865722268819809, "num_tokens": 209613571.0, "step": 1960 }, { "epoch": 4.467502850627138, "grad_norm": 3.4375, "learning_rate": 3.142498288108007e-06, "loss": 0.569, "mean_token_accuracy": 0.8826761692762375, "num_tokens": 209720956.0, "step": 1961 }, { "epoch": 4.469783352337514, "grad_norm": 2.84375, "learning_rate": 3.1406779191729954e-06, "loss": 0.5548, "mean_token_accuracy": 0.8883082419633865, "num_tokens": 209827663.0, "step": 1962 }, { "epoch": 4.4720638540478905, "grad_norm": 3.71875, "learning_rate": 3.1388571866015645e-06, "loss": 0.5845, "mean_token_accuracy": 0.8810272365808487, "num_tokens": 209934611.0, "step": 1963 }, { "epoch": 4.474344355758267, "grad_norm": 3.703125, "learning_rate": 3.1370360914271286e-06, "loss": 0.5721, "mean_token_accuracy": 0.8840647041797638, "num_tokens": 210041595.0, "step": 1964 }, { "epoch": 4.476624857468643, "grad_norm": 3.5625, "learning_rate": 3.1352146346833057e-06, "loss": 0.5845, "mean_token_accuracy": 0.877694696187973, "num_tokens": 210148918.0, "step": 1965 }, { "epoch": 4.478905359179019, "grad_norm": 2.71875, "learning_rate": 3.133392817403919e-06, "loss": 0.5738, "mean_token_accuracy": 0.8824283182621002, "num_tokens": 210256121.0, "step": 1966 }, { "epoch": 4.481185860889395, "grad_norm": 3.1875, "learning_rate": 3.131570640622998e-06, "loss": 0.5771, "mean_token_accuracy": 0.8796381056308746, "num_tokens": 210363033.0, "step": 1967 }, { "epoch": 4.483466362599772, "grad_norm": 2.734375, "learning_rate": 3.1297481053747737e-06, "loss": 0.5526, "mean_token_accuracy": 0.8865970373153687, "num_tokens": 210470814.0, "step": 1968 }, { "epoch": 4.485746864310149, "grad_norm": 2.515625, "learning_rate": 3.127925212693682e-06, "loss": 0.5729, "mean_token_accuracy": 0.8791713118553162, "num_tokens": 210577490.0, "step": 1969 }, { "epoch": 4.488027366020525, "grad_norm": 3.21875, "learning_rate": 3.1261019636143636e-06, "loss": 0.5716, "mean_token_accuracy": 0.8812829405069351, "num_tokens": 210684376.0, "step": 1970 }, { "epoch": 4.490307867730901, "grad_norm": 2.484375, "learning_rate": 3.124278359171657e-06, "loss": 0.5712, "mean_token_accuracy": 0.8833597153425217, "num_tokens": 210791620.0, "step": 1971 }, { "epoch": 4.492588369441277, "grad_norm": 3.671875, "learning_rate": 3.122454400400606e-06, "loss": 0.5684, "mean_token_accuracy": 0.8824751228094101, "num_tokens": 210898333.0, "step": 1972 }, { "epoch": 4.494868871151653, "grad_norm": 2.34375, "learning_rate": 3.1206300883364547e-06, "loss": 0.5772, "mean_token_accuracy": 0.8802805244922638, "num_tokens": 211005919.0, "step": 1973 }, { "epoch": 4.4971493728620295, "grad_norm": 2.59375, "learning_rate": 3.1188054240146463e-06, "loss": 0.5729, "mean_token_accuracy": 0.880774512887001, "num_tokens": 211113271.0, "step": 1974 }, { "epoch": 4.499429874572406, "grad_norm": 2.59375, "learning_rate": 3.1169804084708267e-06, "loss": 0.5885, "mean_token_accuracy": 0.880780965089798, "num_tokens": 211220394.0, "step": 1975 }, { "epoch": 4.501710376282782, "grad_norm": 3.796875, "learning_rate": 3.1151550427408383e-06, "loss": 0.5885, "mean_token_accuracy": 0.8800267279148102, "num_tokens": 211327515.0, "step": 1976 }, { "epoch": 4.503990877993158, "grad_norm": 4.46875, "learning_rate": 3.1133293278607228e-06, "loss": 0.5756, "mean_token_accuracy": 0.882025346159935, "num_tokens": 211434652.0, "step": 1977 }, { "epoch": 4.506271379703534, "grad_norm": 2.671875, "learning_rate": 3.1115032648667224e-06, "loss": 0.5649, "mean_token_accuracy": 0.8816207945346832, "num_tokens": 211542180.0, "step": 1978 }, { "epoch": 4.508551881413911, "grad_norm": 3.25, "learning_rate": 3.1096768547952743e-06, "loss": 0.5811, "mean_token_accuracy": 0.8829948008060455, "num_tokens": 211649219.0, "step": 1979 }, { "epoch": 4.510832383124288, "grad_norm": 2.5, "learning_rate": 3.1078500986830134e-06, "loss": 0.5802, "mean_token_accuracy": 0.8840111643075943, "num_tokens": 211756443.0, "step": 1980 }, { "epoch": 4.510832383124288, "eval_loss": 0.5894109606742859, "eval_mean_token_accuracy": 0.8794237710677172, "eval_num_tokens": 211756443.0, "eval_runtime": 58.6388, "eval_samples_per_second": 142.994, "eval_steps_per_second": 4.485, "step": 1980 }, { "epoch": 4.513112884834664, "grad_norm": 3.09375, "learning_rate": 3.1060229975667716e-06, "loss": 0.5706, "mean_token_accuracy": 0.8810431063175201, "num_tokens": 211863529.0, "step": 1981 }, { "epoch": 4.51539338654504, "grad_norm": 5.625, "learning_rate": 3.104195552483576e-06, "loss": 0.5842, "mean_token_accuracy": 0.8787284940481186, "num_tokens": 211970403.0, "step": 1982 }, { "epoch": 4.517673888255416, "grad_norm": 5.21875, "learning_rate": 3.102367764470649e-06, "loss": 0.568, "mean_token_accuracy": 0.8812828063964844, "num_tokens": 212077504.0, "step": 1983 }, { "epoch": 4.519954389965792, "grad_norm": 3.828125, "learning_rate": 3.1005396345654087e-06, "loss": 0.5814, "mean_token_accuracy": 0.8794949948787689, "num_tokens": 212185300.0, "step": 1984 }, { "epoch": 4.5222348916761685, "grad_norm": 4.25, "learning_rate": 3.0987111638054657e-06, "loss": 0.5711, "mean_token_accuracy": 0.8824008107185364, "num_tokens": 212292097.0, "step": 1985 }, { "epoch": 4.524515393386545, "grad_norm": 5.65625, "learning_rate": 3.0968823532286246e-06, "loss": 0.5683, "mean_token_accuracy": 0.8847775161266327, "num_tokens": 212399289.0, "step": 1986 }, { "epoch": 4.526795895096921, "grad_norm": 4.15625, "learning_rate": 3.095053203872883e-06, "loss": 0.567, "mean_token_accuracy": 0.8831916600465775, "num_tokens": 212506647.0, "step": 1987 }, { "epoch": 4.529076396807298, "grad_norm": 2.453125, "learning_rate": 3.0932237167764306e-06, "loss": 0.5765, "mean_token_accuracy": 0.883448526263237, "num_tokens": 212613620.0, "step": 1988 }, { "epoch": 4.531356898517674, "grad_norm": 4.46875, "learning_rate": 3.0913938929776493e-06, "loss": 0.5899, "mean_token_accuracy": 0.8791800290346146, "num_tokens": 212720388.0, "step": 1989 }, { "epoch": 4.53363740022805, "grad_norm": 2.703125, "learning_rate": 3.0895637335151117e-06, "loss": 0.5799, "mean_token_accuracy": 0.8825742602348328, "num_tokens": 212827508.0, "step": 1990 }, { "epoch": 4.535917901938427, "grad_norm": 4.59375, "learning_rate": 3.0877332394275806e-06, "loss": 0.5885, "mean_token_accuracy": 0.878256767988205, "num_tokens": 212933843.0, "step": 1991 }, { "epoch": 4.538198403648803, "grad_norm": 2.734375, "learning_rate": 3.08590241175401e-06, "loss": 0.5732, "mean_token_accuracy": 0.8803077787160873, "num_tokens": 213040798.0, "step": 1992 }, { "epoch": 4.540478905359179, "grad_norm": 5.1875, "learning_rate": 3.0840712515335412e-06, "loss": 0.5895, "mean_token_accuracy": 0.8778508752584457, "num_tokens": 213147454.0, "step": 1993 }, { "epoch": 4.542759407069555, "grad_norm": 2.921875, "learning_rate": 3.0822397598055065e-06, "loss": 0.5857, "mean_token_accuracy": 0.8773138970136642, "num_tokens": 213255022.0, "step": 1994 }, { "epoch": 4.545039908779931, "grad_norm": 4.375, "learning_rate": 3.080407937609424e-06, "loss": 0.5926, "mean_token_accuracy": 0.8787964135408401, "num_tokens": 213362436.0, "step": 1995 }, { "epoch": 4.5473204104903076, "grad_norm": 2.609375, "learning_rate": 3.0785757859850025e-06, "loss": 0.5892, "mean_token_accuracy": 0.8769951462745667, "num_tokens": 213469549.0, "step": 1996 }, { "epoch": 4.549600912200685, "grad_norm": 3.03125, "learning_rate": 3.0767433059721338e-06, "loss": 0.5777, "mean_token_accuracy": 0.8808469474315643, "num_tokens": 213576382.0, "step": 1997 }, { "epoch": 4.55188141391106, "grad_norm": 2.921875, "learning_rate": 3.074910498610899e-06, "loss": 0.5572, "mean_token_accuracy": 0.887230396270752, "num_tokens": 213683677.0, "step": 1998 }, { "epoch": 4.554161915621437, "grad_norm": 5.03125, "learning_rate": 3.0730773649415647e-06, "loss": 0.5931, "mean_token_accuracy": 0.8774087131023407, "num_tokens": 213790372.0, "step": 1999 }, { "epoch": 4.556442417331813, "grad_norm": 6.09375, "learning_rate": 3.0712439060045818e-06, "loss": 0.5667, "mean_token_accuracy": 0.884253740310669, "num_tokens": 213897106.0, "step": 2000 }, { "epoch": 4.558722919042189, "grad_norm": 5.25, "learning_rate": 3.069410122840585e-06, "loss": 0.5908, "mean_token_accuracy": 0.8797920346260071, "num_tokens": 214004255.0, "step": 2001 }, { "epoch": 4.561003420752566, "grad_norm": 2.546875, "learning_rate": 3.0675760164903972e-06, "loss": 0.5845, "mean_token_accuracy": 0.8811387270689011, "num_tokens": 214111233.0, "step": 2002 }, { "epoch": 4.563283922462942, "grad_norm": 3.125, "learning_rate": 3.065741587995019e-06, "loss": 0.5691, "mean_token_accuracy": 0.8835022449493408, "num_tokens": 214218120.0, "step": 2003 }, { "epoch": 4.565564424173318, "grad_norm": 2.984375, "learning_rate": 3.0639068383956373e-06, "loss": 0.5763, "mean_token_accuracy": 0.880307137966156, "num_tokens": 214324888.0, "step": 2004 }, { "epoch": 4.567844925883694, "grad_norm": 2.640625, "learning_rate": 3.062071768733621e-06, "loss": 0.5561, "mean_token_accuracy": 0.886817216873169, "num_tokens": 214432054.0, "step": 2005 }, { "epoch": 4.57012542759407, "grad_norm": 3.8125, "learning_rate": 3.0602363800505198e-06, "loss": 0.5689, "mean_token_accuracy": 0.882687121629715, "num_tokens": 214539517.0, "step": 2006 }, { "epoch": 4.572405929304447, "grad_norm": 3.015625, "learning_rate": 3.0584006733880656e-06, "loss": 0.5928, "mean_token_accuracy": 0.8774117529392242, "num_tokens": 214646420.0, "step": 2007 }, { "epoch": 4.574686431014824, "grad_norm": 2.890625, "learning_rate": 3.0565646497881697e-06, "loss": 0.5592, "mean_token_accuracy": 0.8867736458778381, "num_tokens": 214753267.0, "step": 2008 }, { "epoch": 4.5769669327252, "grad_norm": 3.078125, "learning_rate": 3.0547283102929228e-06, "loss": 0.5698, "mean_token_accuracy": 0.8841065913438797, "num_tokens": 214860275.0, "step": 2009 }, { "epoch": 4.579247434435576, "grad_norm": 3.109375, "learning_rate": 3.0528916559445967e-06, "loss": 0.5824, "mean_token_accuracy": 0.8801997601985931, "num_tokens": 214967499.0, "step": 2010 }, { "epoch": 4.581527936145952, "grad_norm": 2.390625, "learning_rate": 3.05105468778564e-06, "loss": 0.5594, "mean_token_accuracy": 0.8840168863534927, "num_tokens": 215075121.0, "step": 2011 }, { "epoch": 4.583808437856328, "grad_norm": 5.3125, "learning_rate": 3.049217406858681e-06, "loss": 0.6082, "mean_token_accuracy": 0.8759740740060806, "num_tokens": 215181867.0, "step": 2012 }, { "epoch": 4.586088939566705, "grad_norm": 4.09375, "learning_rate": 3.047379814206526e-06, "loss": 0.5717, "mean_token_accuracy": 0.8838594704866409, "num_tokens": 215288922.0, "step": 2013 }, { "epoch": 4.588369441277081, "grad_norm": 2.546875, "learning_rate": 3.0455419108721556e-06, "loss": 0.564, "mean_token_accuracy": 0.8816310912370682, "num_tokens": 215395792.0, "step": 2014 }, { "epoch": 4.590649942987457, "grad_norm": 2.953125, "learning_rate": 3.043703697898728e-06, "loss": 0.569, "mean_token_accuracy": 0.8817844688892365, "num_tokens": 215503184.0, "step": 2015 }, { "epoch": 4.592930444697833, "grad_norm": 2.234375, "learning_rate": 3.041865176329579e-06, "loss": 0.5444, "mean_token_accuracy": 0.8872413337230682, "num_tokens": 215610549.0, "step": 2016 }, { "epoch": 4.59521094640821, "grad_norm": 3.609375, "learning_rate": 3.040026347208217e-06, "loss": 0.5725, "mean_token_accuracy": 0.882647380232811, "num_tokens": 215718112.0, "step": 2017 }, { "epoch": 4.5974914481185865, "grad_norm": 3.015625, "learning_rate": 3.0381872115783256e-06, "loss": 0.5929, "mean_token_accuracy": 0.8802604079246521, "num_tokens": 215825609.0, "step": 2018 }, { "epoch": 4.599771949828963, "grad_norm": 3.140625, "learning_rate": 3.0363477704837633e-06, "loss": 0.552, "mean_token_accuracy": 0.8875061571598053, "num_tokens": 215932611.0, "step": 2019 }, { "epoch": 4.602052451539339, "grad_norm": 3.1875, "learning_rate": 3.034508024968561e-06, "loss": 0.5714, "mean_token_accuracy": 0.8848912864923477, "num_tokens": 216040088.0, "step": 2020 }, { "epoch": 4.604332953249715, "grad_norm": 4.1875, "learning_rate": 3.032667976076923e-06, "loss": 0.5709, "mean_token_accuracy": 0.8811133801937103, "num_tokens": 216147228.0, "step": 2021 }, { "epoch": 4.606613454960091, "grad_norm": 2.6875, "learning_rate": 3.0308276248532244e-06, "loss": 0.5794, "mean_token_accuracy": 0.8812815397977829, "num_tokens": 216254056.0, "step": 2022 }, { "epoch": 4.608893956670467, "grad_norm": 2.984375, "learning_rate": 3.0289869723420144e-06, "loss": 0.564, "mean_token_accuracy": 0.8829955011606216, "num_tokens": 216361270.0, "step": 2023 }, { "epoch": 4.611174458380844, "grad_norm": 2.46875, "learning_rate": 3.027146019588012e-06, "loss": 0.5619, "mean_token_accuracy": 0.8840386420488358, "num_tokens": 216468575.0, "step": 2024 }, { "epoch": 4.61345496009122, "grad_norm": 5.53125, "learning_rate": 3.025304767636105e-06, "loss": 0.5558, "mean_token_accuracy": 0.8865272402763367, "num_tokens": 216576234.0, "step": 2025 }, { "epoch": 4.615735461801596, "grad_norm": 5.40625, "learning_rate": 3.0234632175313537e-06, "loss": 0.591, "mean_token_accuracy": 0.8783838450908661, "num_tokens": 216683054.0, "step": 2026 }, { "epoch": 4.618015963511972, "grad_norm": 2.703125, "learning_rate": 3.0216213703189856e-06, "loss": 0.5862, "mean_token_accuracy": 0.8805306851863861, "num_tokens": 216790089.0, "step": 2027 }, { "epoch": 4.620296465222349, "grad_norm": 2.46875, "learning_rate": 3.019779227044398e-06, "loss": 0.567, "mean_token_accuracy": 0.8831235766410828, "num_tokens": 216897778.0, "step": 2028 }, { "epoch": 4.6225769669327255, "grad_norm": 2.78125, "learning_rate": 3.0179367887531567e-06, "loss": 0.5734, "mean_token_accuracy": 0.8810001909732819, "num_tokens": 217004442.0, "step": 2029 }, { "epoch": 4.624857468643102, "grad_norm": 2.515625, "learning_rate": 3.016094056490993e-06, "loss": 0.5503, "mean_token_accuracy": 0.8854062855243683, "num_tokens": 217111586.0, "step": 2030 }, { "epoch": 4.627137970353478, "grad_norm": 2.40625, "learning_rate": 3.0142510313038057e-06, "loss": 0.5533, "mean_token_accuracy": 0.8880758136510849, "num_tokens": 217219168.0, "step": 2031 }, { "epoch": 4.629418472063854, "grad_norm": 2.5625, "learning_rate": 3.012407714237662e-06, "loss": 0.5751, "mean_token_accuracy": 0.8846538215875626, "num_tokens": 217326211.0, "step": 2032 }, { "epoch": 4.63169897377423, "grad_norm": 3.4375, "learning_rate": 3.010564106338791e-06, "loss": 0.5731, "mean_token_accuracy": 0.8847674578428268, "num_tokens": 217433707.0, "step": 2033 }, { "epoch": 4.633979475484606, "grad_norm": 2.375, "learning_rate": 3.0087202086535915e-06, "loss": 0.5532, "mean_token_accuracy": 0.8850553631782532, "num_tokens": 217541577.0, "step": 2034 }, { "epoch": 4.636259977194983, "grad_norm": 3.1875, "learning_rate": 3.006876022228622e-06, "loss": 0.5832, "mean_token_accuracy": 0.882301077246666, "num_tokens": 217648764.0, "step": 2035 }, { "epoch": 4.638540478905359, "grad_norm": 2.59375, "learning_rate": 3.0050315481106074e-06, "loss": 0.5741, "mean_token_accuracy": 0.8841645568609238, "num_tokens": 217755467.0, "step": 2036 }, { "epoch": 4.640820980615736, "grad_norm": 3.390625, "learning_rate": 3.0031867873464372e-06, "loss": 0.5864, "mean_token_accuracy": 0.8814872205257416, "num_tokens": 217862349.0, "step": 2037 }, { "epoch": 4.643101482326112, "grad_norm": 2.875, "learning_rate": 3.00134174098316e-06, "loss": 0.6096, "mean_token_accuracy": 0.8733366578817368, "num_tokens": 217968209.0, "step": 2038 }, { "epoch": 4.645381984036488, "grad_norm": 2.578125, "learning_rate": 2.999496410067989e-06, "loss": 0.5549, "mean_token_accuracy": 0.8875339925289154, "num_tokens": 218076237.0, "step": 2039 }, { "epoch": 4.6476624857468645, "grad_norm": 3.640625, "learning_rate": 2.9976507956482996e-06, "loss": 0.5847, "mean_token_accuracy": 0.8795648515224457, "num_tokens": 218183464.0, "step": 2040 }, { "epoch": 4.649942987457241, "grad_norm": 2.46875, "learning_rate": 2.9958048987716266e-06, "loss": 0.6071, "mean_token_accuracy": 0.8746256977319717, "num_tokens": 218290055.0, "step": 2041 }, { "epoch": 4.652223489167617, "grad_norm": 3.703125, "learning_rate": 2.993958720485664e-06, "loss": 0.5912, "mean_token_accuracy": 0.8757225871086121, "num_tokens": 218397051.0, "step": 2042 }, { "epoch": 4.654503990877993, "grad_norm": 2.828125, "learning_rate": 2.9921122618382687e-06, "loss": 0.5815, "mean_token_accuracy": 0.8802944868803024, "num_tokens": 218504413.0, "step": 2043 }, { "epoch": 4.656784492588369, "grad_norm": 2.71875, "learning_rate": 2.9902655238774537e-06, "loss": 0.575, "mean_token_accuracy": 0.8826518207788467, "num_tokens": 218611326.0, "step": 2044 }, { "epoch": 4.659064994298745, "grad_norm": 3.78125, "learning_rate": 2.988418507651392e-06, "loss": 0.5768, "mean_token_accuracy": 0.8791725039482117, "num_tokens": 218718666.0, "step": 2045 }, { "epoch": 4.661345496009122, "grad_norm": 3.28125, "learning_rate": 2.9865712142084145e-06, "loss": 0.5578, "mean_token_accuracy": 0.8884040713310242, "num_tokens": 218826339.0, "step": 2046 }, { "epoch": 4.663625997719498, "grad_norm": 3.46875, "learning_rate": 2.98472364459701e-06, "loss": 0.564, "mean_token_accuracy": 0.8853495866060257, "num_tokens": 218933583.0, "step": 2047 }, { "epoch": 4.665906499429875, "grad_norm": 3.703125, "learning_rate": 2.982875799865823e-06, "loss": 0.5685, "mean_token_accuracy": 0.8821221739053726, "num_tokens": 219040654.0, "step": 2048 }, { "epoch": 4.668187001140251, "grad_norm": 3.328125, "learning_rate": 2.9810276810636535e-06, "loss": 0.6013, "mean_token_accuracy": 0.8774379193782806, "num_tokens": 219147467.0, "step": 2049 }, { "epoch": 4.670467502850627, "grad_norm": 3.546875, "learning_rate": 2.97917928923946e-06, "loss": 0.5834, "mean_token_accuracy": 0.879958301782608, "num_tokens": 219254877.0, "step": 2050 }, { "epoch": 4.6727480045610035, "grad_norm": 2.578125, "learning_rate": 2.977330625442352e-06, "loss": 0.5609, "mean_token_accuracy": 0.8850719779729843, "num_tokens": 219362293.0, "step": 2051 }, { "epoch": 4.67502850627138, "grad_norm": 4.09375, "learning_rate": 2.9754816907215963e-06, "loss": 0.5811, "mean_token_accuracy": 0.8815977722406387, "num_tokens": 219469187.0, "step": 2052 }, { "epoch": 4.677309007981756, "grad_norm": 2.5625, "learning_rate": 2.9736324861266125e-06, "loss": 0.5835, "mean_token_accuracy": 0.8775934129953384, "num_tokens": 219576287.0, "step": 2053 }, { "epoch": 4.679589509692132, "grad_norm": 5.3125, "learning_rate": 2.9717830127069734e-06, "loss": 0.5726, "mean_token_accuracy": 0.8794510513544083, "num_tokens": 219683068.0, "step": 2054 }, { "epoch": 4.681870011402508, "grad_norm": 2.796875, "learning_rate": 2.969933271512404e-06, "loss": 0.5688, "mean_token_accuracy": 0.8821385949850082, "num_tokens": 219790016.0, "step": 2055 }, { "epoch": 4.684150513112884, "grad_norm": 2.890625, "learning_rate": 2.9680832635927824e-06, "loss": 0.5908, "mean_token_accuracy": 0.8790445029735565, "num_tokens": 219896728.0, "step": 2056 }, { "epoch": 4.6864310148232615, "grad_norm": 2.75, "learning_rate": 2.9662329899981375e-06, "loss": 0.5726, "mean_token_accuracy": 0.8832840174436569, "num_tokens": 220004033.0, "step": 2057 }, { "epoch": 4.688711516533638, "grad_norm": 3.3125, "learning_rate": 2.964382451778648e-06, "loss": 0.5654, "mean_token_accuracy": 0.8850243538618088, "num_tokens": 220111159.0, "step": 2058 }, { "epoch": 4.690992018244014, "grad_norm": 3.984375, "learning_rate": 2.9625316499846444e-06, "loss": 0.582, "mean_token_accuracy": 0.8815952986478806, "num_tokens": 220218001.0, "step": 2059 }, { "epoch": 4.69327251995439, "grad_norm": 2.8125, "learning_rate": 2.9606805856666053e-06, "loss": 0.6012, "mean_token_accuracy": 0.8748088330030441, "num_tokens": 220325346.0, "step": 2060 }, { "epoch": 4.695553021664766, "grad_norm": 2.34375, "learning_rate": 2.95882925987516e-06, "loss": 0.5824, "mean_token_accuracy": 0.8794578313827515, "num_tokens": 220432554.0, "step": 2061 }, { "epoch": 4.6978335233751425, "grad_norm": 3.390625, "learning_rate": 2.9569776736610855e-06, "loss": 0.5848, "mean_token_accuracy": 0.8808020949363708, "num_tokens": 220539630.0, "step": 2062 }, { "epoch": 4.700114025085519, "grad_norm": 2.96875, "learning_rate": 2.9551258280753046e-06, "loss": 0.5779, "mean_token_accuracy": 0.8800801783800125, "num_tokens": 220646814.0, "step": 2063 }, { "epoch": 4.702394526795895, "grad_norm": 3.8125, "learning_rate": 2.953273724168891e-06, "loss": 0.568, "mean_token_accuracy": 0.8822778612375259, "num_tokens": 220753926.0, "step": 2064 }, { "epoch": 4.704675028506271, "grad_norm": 3.53125, "learning_rate": 2.9514213629930614e-06, "loss": 0.5749, "mean_token_accuracy": 0.8794926106929779, "num_tokens": 220860498.0, "step": 2065 }, { "epoch": 4.706955530216648, "grad_norm": 2.890625, "learning_rate": 2.949568745599182e-06, "loss": 0.5579, "mean_token_accuracy": 0.8850435167551041, "num_tokens": 220967240.0, "step": 2066 }, { "epoch": 4.7092360319270234, "grad_norm": 4.21875, "learning_rate": 2.9477158730387615e-06, "loss": 0.5811, "mean_token_accuracy": 0.8826223760843277, "num_tokens": 221074023.0, "step": 2067 }, { "epoch": 4.7115165336374005, "grad_norm": 2.90625, "learning_rate": 2.945862746363455e-06, "loss": 0.5591, "mean_token_accuracy": 0.8868841081857681, "num_tokens": 221181425.0, "step": 2068 }, { "epoch": 4.713797035347777, "grad_norm": 3.4375, "learning_rate": 2.944009366625061e-06, "loss": 0.5795, "mean_token_accuracy": 0.8801351636648178, "num_tokens": 221288251.0, "step": 2069 }, { "epoch": 4.716077537058153, "grad_norm": 2.765625, "learning_rate": 2.942155734875523e-06, "loss": 0.5802, "mean_token_accuracy": 0.8805908411741257, "num_tokens": 221395168.0, "step": 2070 }, { "epoch": 4.718358038768529, "grad_norm": 6.375, "learning_rate": 2.9403018521669256e-06, "loss": 0.5966, "mean_token_accuracy": 0.8779006004333496, "num_tokens": 221502170.0, "step": 2071 }, { "epoch": 4.720638540478905, "grad_norm": 3.71875, "learning_rate": 2.938447719551498e-06, "loss": 0.5822, "mean_token_accuracy": 0.8802389800548553, "num_tokens": 221608850.0, "step": 2072 }, { "epoch": 4.7229190421892815, "grad_norm": 3.3125, "learning_rate": 2.9365933380816092e-06, "loss": 0.5826, "mean_token_accuracy": 0.8805797696113586, "num_tokens": 221715991.0, "step": 2073 }, { "epoch": 4.725199543899658, "grad_norm": 5.28125, "learning_rate": 2.93473870880977e-06, "loss": 0.591, "mean_token_accuracy": 0.8762219101190567, "num_tokens": 221823317.0, "step": 2074 }, { "epoch": 4.727480045610034, "grad_norm": 3.578125, "learning_rate": 2.932883832788633e-06, "loss": 0.5591, "mean_token_accuracy": 0.8835586160421371, "num_tokens": 221930410.0, "step": 2075 }, { "epoch": 4.72976054732041, "grad_norm": 2.609375, "learning_rate": 2.9310287110709895e-06, "loss": 0.5716, "mean_token_accuracy": 0.881802573800087, "num_tokens": 222037632.0, "step": 2076 }, { "epoch": 4.732041049030787, "grad_norm": 3.828125, "learning_rate": 2.9291733447097714e-06, "loss": 0.5588, "mean_token_accuracy": 0.8866459727287292, "num_tokens": 222144620.0, "step": 2077 }, { "epoch": 4.734321550741163, "grad_norm": 3.625, "learning_rate": 2.927317734758047e-06, "loss": 0.5811, "mean_token_accuracy": 0.879328653216362, "num_tokens": 222251771.0, "step": 2078 }, { "epoch": 4.7366020524515395, "grad_norm": 4.21875, "learning_rate": 2.925461882269027e-06, "loss": 0.5795, "mean_token_accuracy": 0.8794370591640472, "num_tokens": 222359272.0, "step": 2079 }, { "epoch": 4.738882554161916, "grad_norm": 2.671875, "learning_rate": 2.9236057882960567e-06, "loss": 0.5677, "mean_token_accuracy": 0.8789798021316528, "num_tokens": 222466677.0, "step": 2080 }, { "epoch": 4.741163055872292, "grad_norm": 3.09375, "learning_rate": 2.921749453892618e-06, "loss": 0.579, "mean_token_accuracy": 0.8830148726701736, "num_tokens": 222573940.0, "step": 2081 }, { "epoch": 4.743443557582668, "grad_norm": 2.765625, "learning_rate": 2.919892880112332e-06, "loss": 0.5811, "mean_token_accuracy": 0.8783297091722488, "num_tokens": 222680851.0, "step": 2082 }, { "epoch": 4.745724059293044, "grad_norm": 3.109375, "learning_rate": 2.9180360680089542e-06, "loss": 0.574, "mean_token_accuracy": 0.8823717683553696, "num_tokens": 222787611.0, "step": 2083 }, { "epoch": 4.7480045610034205, "grad_norm": 2.75, "learning_rate": 2.9161790186363746e-06, "loss": 0.5668, "mean_token_accuracy": 0.8819119483232498, "num_tokens": 222894602.0, "step": 2084 }, { "epoch": 4.750285062713797, "grad_norm": 2.8125, "learning_rate": 2.9143217330486186e-06, "loss": 0.56, "mean_token_accuracy": 0.8828014135360718, "num_tokens": 223001331.0, "step": 2085 }, { "epoch": 4.752565564424174, "grad_norm": 3.078125, "learning_rate": 2.9124642122998453e-06, "loss": 0.5765, "mean_token_accuracy": 0.8834296762943268, "num_tokens": 223108795.0, "step": 2086 }, { "epoch": 4.75484606613455, "grad_norm": 4.53125, "learning_rate": 2.9106064574443477e-06, "loss": 0.5772, "mean_token_accuracy": 0.8811309486627579, "num_tokens": 223215668.0, "step": 2087 }, { "epoch": 4.757126567844926, "grad_norm": 6.09375, "learning_rate": 2.9087484695365523e-06, "loss": 0.5878, "mean_token_accuracy": 0.8807103782892227, "num_tokens": 223322617.0, "step": 2088 }, { "epoch": 4.759407069555302, "grad_norm": 5.6875, "learning_rate": 2.906890249631017e-06, "loss": 0.5737, "mean_token_accuracy": 0.8810094445943832, "num_tokens": 223429512.0, "step": 2089 }, { "epoch": 4.7616875712656785, "grad_norm": 2.953125, "learning_rate": 2.905031798782431e-06, "loss": 0.5522, "mean_token_accuracy": 0.8895687907934189, "num_tokens": 223536691.0, "step": 2090 }, { "epoch": 4.763968072976055, "grad_norm": 2.75, "learning_rate": 2.903173118045616e-06, "loss": 0.5738, "mean_token_accuracy": 0.8834663927555084, "num_tokens": 223643826.0, "step": 2091 }, { "epoch": 4.766248574686431, "grad_norm": 4.40625, "learning_rate": 2.901314208475522e-06, "loss": 0.5904, "mean_token_accuracy": 0.8782705664634705, "num_tokens": 223750903.0, "step": 2092 }, { "epoch": 4.768529076396807, "grad_norm": 4.28125, "learning_rate": 2.8994550711272317e-06, "loss": 0.5789, "mean_token_accuracy": 0.8804999589920044, "num_tokens": 223858002.0, "step": 2093 }, { "epoch": 4.770809578107183, "grad_norm": 2.921875, "learning_rate": 2.897595707055954e-06, "loss": 0.5673, "mean_token_accuracy": 0.8842459321022034, "num_tokens": 223965056.0, "step": 2094 }, { "epoch": 4.7730900798175595, "grad_norm": 3.21875, "learning_rate": 2.8957361173170297e-06, "loss": 0.5757, "mean_token_accuracy": 0.8789258599281311, "num_tokens": 224071784.0, "step": 2095 }, { "epoch": 4.775370581527936, "grad_norm": 3.015625, "learning_rate": 2.893876302965925e-06, "loss": 0.5705, "mean_token_accuracy": 0.8828511238098145, "num_tokens": 224178796.0, "step": 2096 }, { "epoch": 4.777651083238313, "grad_norm": 2.96875, "learning_rate": 2.8920162650582344e-06, "loss": 0.5772, "mean_token_accuracy": 0.8815236538648605, "num_tokens": 224287157.0, "step": 2097 }, { "epoch": 4.779931584948689, "grad_norm": 4.625, "learning_rate": 2.8901560046496797e-06, "loss": 0.5749, "mean_token_accuracy": 0.8796006739139557, "num_tokens": 224394174.0, "step": 2098 }, { "epoch": 4.782212086659065, "grad_norm": 4.34375, "learning_rate": 2.8882955227961098e-06, "loss": 0.5891, "mean_token_accuracy": 0.8783185184001923, "num_tokens": 224501173.0, "step": 2099 }, { "epoch": 4.784492588369441, "grad_norm": 3.6875, "learning_rate": 2.886434820553497e-06, "loss": 0.5731, "mean_token_accuracy": 0.8836559951305389, "num_tokens": 224608530.0, "step": 2100 }, { "epoch": 4.7867730900798175, "grad_norm": 3.25, "learning_rate": 2.884573898977941e-06, "loss": 0.5632, "mean_token_accuracy": 0.8841763436794281, "num_tokens": 224715326.0, "step": 2101 }, { "epoch": 4.789053591790194, "grad_norm": 2.5, "learning_rate": 2.882712759125664e-06, "loss": 0.5582, "mean_token_accuracy": 0.8851722776889801, "num_tokens": 224822759.0, "step": 2102 }, { "epoch": 4.79133409350057, "grad_norm": 2.65625, "learning_rate": 2.8808514020530127e-06, "loss": 0.5735, "mean_token_accuracy": 0.8818291127681732, "num_tokens": 224929583.0, "step": 2103 }, { "epoch": 4.793614595210946, "grad_norm": 3.859375, "learning_rate": 2.8789898288164595e-06, "loss": 0.5628, "mean_token_accuracy": 0.8850146681070328, "num_tokens": 225036349.0, "step": 2104 }, { "epoch": 4.795895096921322, "grad_norm": 3.890625, "learning_rate": 2.8771280404725953e-06, "loss": 0.5702, "mean_token_accuracy": 0.8813169300556183, "num_tokens": 225142845.0, "step": 2105 }, { "epoch": 4.798175598631699, "grad_norm": 5.875, "learning_rate": 2.8752660380781367e-06, "loss": 0.5736, "mean_token_accuracy": 0.8808184266090393, "num_tokens": 225250066.0, "step": 2106 }, { "epoch": 4.800456100342076, "grad_norm": 4.0, "learning_rate": 2.8734038226899198e-06, "loss": 0.5559, "mean_token_accuracy": 0.8856227844953537, "num_tokens": 225356570.0, "step": 2107 }, { "epoch": 4.802736602052452, "grad_norm": 2.8125, "learning_rate": 2.8715413953649012e-06, "loss": 0.569, "mean_token_accuracy": 0.8814336508512497, "num_tokens": 225463833.0, "step": 2108 }, { "epoch": 4.805017103762828, "grad_norm": 4.125, "learning_rate": 2.8696787571601597e-06, "loss": 0.5868, "mean_token_accuracy": 0.8789616823196411, "num_tokens": 225570947.0, "step": 2109 }, { "epoch": 4.807297605473204, "grad_norm": 5.4375, "learning_rate": 2.8678159091328926e-06, "loss": 0.5651, "mean_token_accuracy": 0.8813024163246155, "num_tokens": 225678130.0, "step": 2110 }, { "epoch": 4.80957810718358, "grad_norm": 2.8125, "learning_rate": 2.865952852340417e-06, "loss": 0.5783, "mean_token_accuracy": 0.8798904716968536, "num_tokens": 225785246.0, "step": 2111 }, { "epoch": 4.811858608893957, "grad_norm": 6.53125, "learning_rate": 2.864089587840167e-06, "loss": 0.5956, "mean_token_accuracy": 0.8791421502828598, "num_tokens": 225891598.0, "step": 2112 }, { "epoch": 4.814139110604333, "grad_norm": 3.25, "learning_rate": 2.862226116689696e-06, "loss": 0.5793, "mean_token_accuracy": 0.8808872699737549, "num_tokens": 225998427.0, "step": 2113 }, { "epoch": 4.816419612314709, "grad_norm": 2.515625, "learning_rate": 2.8603624399466732e-06, "loss": 0.561, "mean_token_accuracy": 0.8853624612092972, "num_tokens": 226106239.0, "step": 2114 }, { "epoch": 4.818700114025085, "grad_norm": 2.46875, "learning_rate": 2.858498558668888e-06, "loss": 0.549, "mean_token_accuracy": 0.886398509144783, "num_tokens": 226213512.0, "step": 2115 }, { "epoch": 4.820980615735461, "grad_norm": 2.984375, "learning_rate": 2.856634473914242e-06, "loss": 0.5705, "mean_token_accuracy": 0.8814459592103958, "num_tokens": 226321236.0, "step": 2116 }, { "epoch": 4.823261117445838, "grad_norm": 4.0625, "learning_rate": 2.854770186740753e-06, "loss": 0.5703, "mean_token_accuracy": 0.8803770244121552, "num_tokens": 226428875.0, "step": 2117 }, { "epoch": 4.825541619156215, "grad_norm": 6.09375, "learning_rate": 2.8529056982065557e-06, "loss": 0.5628, "mean_token_accuracy": 0.8832362145185471, "num_tokens": 226535810.0, "step": 2118 }, { "epoch": 4.827822120866591, "grad_norm": 5.875, "learning_rate": 2.8510410093698966e-06, "loss": 0.5848, "mean_token_accuracy": 0.8776679933071136, "num_tokens": 226642320.0, "step": 2119 }, { "epoch": 4.830102622576967, "grad_norm": 6.03125, "learning_rate": 2.849176121289138e-06, "loss": 0.5934, "mean_token_accuracy": 0.8798095434904099, "num_tokens": 226750138.0, "step": 2120 }, { "epoch": 4.832383124287343, "grad_norm": 2.765625, "learning_rate": 2.8473110350227536e-06, "loss": 0.5673, "mean_token_accuracy": 0.8817552924156189, "num_tokens": 226857421.0, "step": 2121 }, { "epoch": 4.834663625997719, "grad_norm": 4.125, "learning_rate": 2.845445751629331e-06, "loss": 0.5896, "mean_token_accuracy": 0.8786935806274414, "num_tokens": 226965168.0, "step": 2122 }, { "epoch": 4.836944127708096, "grad_norm": 3.390625, "learning_rate": 2.843580272167569e-06, "loss": 0.5703, "mean_token_accuracy": 0.8808483481407166, "num_tokens": 227072404.0, "step": 2123 }, { "epoch": 4.839224629418472, "grad_norm": 6.625, "learning_rate": 2.8417145976962773e-06, "loss": 0.5926, "mean_token_accuracy": 0.8761008530855179, "num_tokens": 227179076.0, "step": 2124 }, { "epoch": 4.841505131128848, "grad_norm": 4.75, "learning_rate": 2.8398487292743772e-06, "loss": 0.5744, "mean_token_accuracy": 0.8841915279626846, "num_tokens": 227285768.0, "step": 2125 }, { "epoch": 4.843785632839225, "grad_norm": 2.6875, "learning_rate": 2.8379826679609e-06, "loss": 0.5736, "mean_token_accuracy": 0.8802362382411957, "num_tokens": 227392431.0, "step": 2126 }, { "epoch": 4.846066134549601, "grad_norm": 2.609375, "learning_rate": 2.836116414814985e-06, "loss": 0.6062, "mean_token_accuracy": 0.8763725161552429, "num_tokens": 227499046.0, "step": 2127 }, { "epoch": 4.848346636259977, "grad_norm": 3.203125, "learning_rate": 2.8342499708958827e-06, "loss": 0.5858, "mean_token_accuracy": 0.8813872933387756, "num_tokens": 227606044.0, "step": 2128 }, { "epoch": 4.850627137970354, "grad_norm": 3.546875, "learning_rate": 2.8323833372629485e-06, "loss": 0.5857, "mean_token_accuracy": 0.8817006945610046, "num_tokens": 227713533.0, "step": 2129 }, { "epoch": 4.85290763968073, "grad_norm": 5.625, "learning_rate": 2.8305165149756496e-06, "loss": 0.5596, "mean_token_accuracy": 0.8857055157423019, "num_tokens": 227820658.0, "step": 2130 }, { "epoch": 4.855188141391106, "grad_norm": 3.25, "learning_rate": 2.828649505093558e-06, "loss": 0.5768, "mean_token_accuracy": 0.8798857480287552, "num_tokens": 227927701.0, "step": 2131 }, { "epoch": 4.857468643101482, "grad_norm": 4.53125, "learning_rate": 2.826782308676351e-06, "loss": 0.5818, "mean_token_accuracy": 0.8805892914533615, "num_tokens": 228034374.0, "step": 2132 }, { "epoch": 4.859749144811858, "grad_norm": 2.75, "learning_rate": 2.824914926783815e-06, "loss": 0.5683, "mean_token_accuracy": 0.8796881288290024, "num_tokens": 228141770.0, "step": 2133 }, { "epoch": 4.862029646522235, "grad_norm": 2.53125, "learning_rate": 2.82304736047584e-06, "loss": 0.5865, "mean_token_accuracy": 0.8820720165967941, "num_tokens": 228249025.0, "step": 2134 }, { "epoch": 4.864310148232612, "grad_norm": 4.84375, "learning_rate": 2.821179610812419e-06, "loss": 0.5874, "mean_token_accuracy": 0.8788727074861526, "num_tokens": 228356604.0, "step": 2135 }, { "epoch": 4.866590649942988, "grad_norm": 6.71875, "learning_rate": 2.819311678853652e-06, "loss": 0.5729, "mean_token_accuracy": 0.8829772472381592, "num_tokens": 228464360.0, "step": 2136 }, { "epoch": 4.868871151653364, "grad_norm": 3.046875, "learning_rate": 2.8174435656597403e-06, "loss": 0.557, "mean_token_accuracy": 0.8821916729211807, "num_tokens": 228571499.0, "step": 2137 }, { "epoch": 4.87115165336374, "grad_norm": 4.78125, "learning_rate": 2.8155752722909896e-06, "loss": 0.5777, "mean_token_accuracy": 0.8816975653171539, "num_tokens": 228678610.0, "step": 2138 }, { "epoch": 4.873432155074116, "grad_norm": 2.953125, "learning_rate": 2.8137067998078073e-06, "loss": 0.5852, "mean_token_accuracy": 0.8804251700639725, "num_tokens": 228785226.0, "step": 2139 }, { "epoch": 4.875712656784493, "grad_norm": 2.421875, "learning_rate": 2.8118381492707004e-06, "loss": 0.5822, "mean_token_accuracy": 0.8774864822626114, "num_tokens": 228891949.0, "step": 2140 }, { "epoch": 4.877993158494869, "grad_norm": 3.6875, "learning_rate": 2.8099693217402807e-06, "loss": 0.5764, "mean_token_accuracy": 0.8809685558080673, "num_tokens": 228999525.0, "step": 2141 }, { "epoch": 4.880273660205245, "grad_norm": 3.859375, "learning_rate": 2.808100318277258e-06, "loss": 0.566, "mean_token_accuracy": 0.8823101967573166, "num_tokens": 229106700.0, "step": 2142 }, { "epoch": 4.882554161915621, "grad_norm": 2.359375, "learning_rate": 2.806231139942443e-06, "loss": 0.564, "mean_token_accuracy": 0.8847738802433014, "num_tokens": 229214479.0, "step": 2143 }, { "epoch": 4.884834663625997, "grad_norm": 2.8125, "learning_rate": 2.8043617877967456e-06, "loss": 0.5809, "mean_token_accuracy": 0.8798518478870392, "num_tokens": 229321261.0, "step": 2144 }, { "epoch": 4.887115165336374, "grad_norm": 3.390625, "learning_rate": 2.8024922629011727e-06, "loss": 0.571, "mean_token_accuracy": 0.883740559220314, "num_tokens": 229428653.0, "step": 2145 }, { "epoch": 4.889395667046751, "grad_norm": 3.046875, "learning_rate": 2.800622566316831e-06, "loss": 0.5782, "mean_token_accuracy": 0.8811280727386475, "num_tokens": 229535592.0, "step": 2146 }, { "epoch": 4.891676168757127, "grad_norm": 4.03125, "learning_rate": 2.798752699104925e-06, "loss": 0.5643, "mean_token_accuracy": 0.8848204910755157, "num_tokens": 229642187.0, "step": 2147 }, { "epoch": 4.893956670467503, "grad_norm": 2.8125, "learning_rate": 2.7968826623267542e-06, "loss": 0.5684, "mean_token_accuracy": 0.8824999332427979, "num_tokens": 229749302.0, "step": 2148 }, { "epoch": 4.896237172177879, "grad_norm": 2.8125, "learning_rate": 2.7950124570437163e-06, "loss": 0.573, "mean_token_accuracy": 0.8834424167871475, "num_tokens": 229856402.0, "step": 2149 }, { "epoch": 4.898517673888255, "grad_norm": 3.59375, "learning_rate": 2.793142084317303e-06, "loss": 0.5748, "mean_token_accuracy": 0.8832953125238419, "num_tokens": 229963106.0, "step": 2150 }, { "epoch": 4.900798175598632, "grad_norm": 2.578125, "learning_rate": 2.7912715452091014e-06, "loss": 0.5629, "mean_token_accuracy": 0.8826733380556107, "num_tokens": 230070472.0, "step": 2151 }, { "epoch": 4.903078677309008, "grad_norm": 3.109375, "learning_rate": 2.789400840780795e-06, "loss": 0.5928, "mean_token_accuracy": 0.8755721002817154, "num_tokens": 230177193.0, "step": 2152 }, { "epoch": 4.905359179019384, "grad_norm": 3.40625, "learning_rate": 2.7875299720941577e-06, "loss": 0.5809, "mean_token_accuracy": 0.8802897483110428, "num_tokens": 230284630.0, "step": 2153 }, { "epoch": 4.90763968072976, "grad_norm": 3.296875, "learning_rate": 2.785658940211059e-06, "loss": 0.5876, "mean_token_accuracy": 0.8808315396308899, "num_tokens": 230391476.0, "step": 2154 }, { "epoch": 4.909920182440137, "grad_norm": 4.4375, "learning_rate": 2.7837877461934616e-06, "loss": 0.5761, "mean_token_accuracy": 0.881686270236969, "num_tokens": 230498008.0, "step": 2155 }, { "epoch": 4.9122006841505135, "grad_norm": 2.578125, "learning_rate": 2.7819163911034175e-06, "loss": 0.571, "mean_token_accuracy": 0.882976695895195, "num_tokens": 230605141.0, "step": 2156 }, { "epoch": 4.91448118586089, "grad_norm": 4.46875, "learning_rate": 2.7800448760030724e-06, "loss": 0.5672, "mean_token_accuracy": 0.8807835131883621, "num_tokens": 230712400.0, "step": 2157 }, { "epoch": 4.916761687571266, "grad_norm": 7.28125, "learning_rate": 2.7781732019546625e-06, "loss": 0.5847, "mean_token_accuracy": 0.8794967532157898, "num_tokens": 230819368.0, "step": 2158 }, { "epoch": 4.919042189281642, "grad_norm": 2.71875, "learning_rate": 2.776301370020513e-06, "loss": 0.5858, "mean_token_accuracy": 0.8819598108530045, "num_tokens": 230925639.0, "step": 2159 }, { "epoch": 4.921322690992018, "grad_norm": 2.765625, "learning_rate": 2.7744293812630412e-06, "loss": 0.5743, "mean_token_accuracy": 0.881621241569519, "num_tokens": 231032712.0, "step": 2160 }, { "epoch": 4.923603192702394, "grad_norm": 3.859375, "learning_rate": 2.77255723674475e-06, "loss": 0.5747, "mean_token_accuracy": 0.8840135484933853, "num_tokens": 231139944.0, "step": 2161 }, { "epoch": 4.925883694412771, "grad_norm": 3.28125, "learning_rate": 2.770684937528233e-06, "loss": 0.5815, "mean_token_accuracy": 0.8798476308584213, "num_tokens": 231246948.0, "step": 2162 }, { "epoch": 4.928164196123147, "grad_norm": 2.921875, "learning_rate": 2.7688124846761716e-06, "loss": 0.5845, "mean_token_accuracy": 0.8791171163320541, "num_tokens": 231353810.0, "step": 2163 }, { "epoch": 4.930444697833523, "grad_norm": 3.96875, "learning_rate": 2.766939879251333e-06, "loss": 0.5873, "mean_token_accuracy": 0.876966580748558, "num_tokens": 231461394.0, "step": 2164 }, { "epoch": 4.932725199543899, "grad_norm": 2.765625, "learning_rate": 2.7650671223165726e-06, "loss": 0.5547, "mean_token_accuracy": 0.8889574855566025, "num_tokens": 231568225.0, "step": 2165 }, { "epoch": 4.935005701254276, "grad_norm": 3.0, "learning_rate": 2.7631942149348313e-06, "loss": 0.5643, "mean_token_accuracy": 0.8834359645843506, "num_tokens": 231675236.0, "step": 2166 }, { "epoch": 4.9372862029646525, "grad_norm": 4.78125, "learning_rate": 2.761321158169134e-06, "loss": 0.5827, "mean_token_accuracy": 0.8793386965990067, "num_tokens": 231782454.0, "step": 2167 }, { "epoch": 4.939566704675029, "grad_norm": 4.78125, "learning_rate": 2.759447953082593e-06, "loss": 0.5755, "mean_token_accuracy": 0.8823465257883072, "num_tokens": 231889063.0, "step": 2168 }, { "epoch": 4.941847206385405, "grad_norm": 3.015625, "learning_rate": 2.757574600738402e-06, "loss": 0.5814, "mean_token_accuracy": 0.8793519288301468, "num_tokens": 231995594.0, "step": 2169 }, { "epoch": 4.944127708095781, "grad_norm": 4.78125, "learning_rate": 2.755701102199841e-06, "loss": 0.5736, "mean_token_accuracy": 0.8815154582262039, "num_tokens": 232102587.0, "step": 2170 }, { "epoch": 4.946408209806157, "grad_norm": 3.28125, "learning_rate": 2.7538274585302707e-06, "loss": 0.5541, "mean_token_accuracy": 0.8854823410511017, "num_tokens": 232209760.0, "step": 2171 }, { "epoch": 4.9486887115165334, "grad_norm": 3.34375, "learning_rate": 2.751953670793135e-06, "loss": 0.5729, "mean_token_accuracy": 0.8862673789262772, "num_tokens": 232316636.0, "step": 2172 }, { "epoch": 4.95096921322691, "grad_norm": 3.1875, "learning_rate": 2.7500797400519595e-06, "loss": 0.5792, "mean_token_accuracy": 0.8799717128276825, "num_tokens": 232423639.0, "step": 2173 }, { "epoch": 4.953249714937286, "grad_norm": 2.90625, "learning_rate": 2.7482056673703526e-06, "loss": 0.589, "mean_token_accuracy": 0.8792567402124405, "num_tokens": 232530768.0, "step": 2174 }, { "epoch": 4.955530216647663, "grad_norm": 4.53125, "learning_rate": 2.746331453812e-06, "loss": 0.5748, "mean_token_accuracy": 0.8782864660024643, "num_tokens": 232637783.0, "step": 2175 }, { "epoch": 4.957810718358039, "grad_norm": 3.125, "learning_rate": 2.74445710044067e-06, "loss": 0.5983, "mean_token_accuracy": 0.8764726370573044, "num_tokens": 232745202.0, "step": 2176 }, { "epoch": 4.960091220068415, "grad_norm": 3.09375, "learning_rate": 2.7425826083202096e-06, "loss": 0.5832, "mean_token_accuracy": 0.8824016451835632, "num_tokens": 232852068.0, "step": 2177 }, { "epoch": 4.9623717217787915, "grad_norm": 2.84375, "learning_rate": 2.740707978514543e-06, "loss": 0.5526, "mean_token_accuracy": 0.8894577920436859, "num_tokens": 232959452.0, "step": 2178 }, { "epoch": 4.964652223489168, "grad_norm": 4.34375, "learning_rate": 2.738833212087676e-06, "loss": 0.5825, "mean_token_accuracy": 0.8816526979207993, "num_tokens": 233066740.0, "step": 2179 }, { "epoch": 4.966932725199544, "grad_norm": 2.703125, "learning_rate": 2.736958310103688e-06, "loss": 0.5726, "mean_token_accuracy": 0.8829785734415054, "num_tokens": 233173753.0, "step": 2180 }, { "epoch": 4.96921322690992, "grad_norm": 2.421875, "learning_rate": 2.735083273626738e-06, "loss": 0.5815, "mean_token_accuracy": 0.8816793113946915, "num_tokens": 233280384.0, "step": 2181 }, { "epoch": 4.971493728620296, "grad_norm": 5.0, "learning_rate": 2.7332081037210607e-06, "loss": 0.5743, "mean_token_accuracy": 0.8826627433300018, "num_tokens": 233388025.0, "step": 2182 }, { "epoch": 4.9737742303306725, "grad_norm": 3.875, "learning_rate": 2.7313328014509653e-06, "loss": 0.5789, "mean_token_accuracy": 0.8806509524583817, "num_tokens": 233494917.0, "step": 2183 }, { "epoch": 4.976054732041049, "grad_norm": 3.65625, "learning_rate": 2.729457367880838e-06, "loss": 0.5431, "mean_token_accuracy": 0.886359453201294, "num_tokens": 233602067.0, "step": 2184 }, { "epoch": 4.978335233751425, "grad_norm": 2.8125, "learning_rate": 2.727581804075139e-06, "loss": 0.5653, "mean_token_accuracy": 0.8818333595991135, "num_tokens": 233708533.0, "step": 2185 }, { "epoch": 4.980615735461802, "grad_norm": 2.578125, "learning_rate": 2.7257061110984005e-06, "loss": 0.5594, "mean_token_accuracy": 0.8846048712730408, "num_tokens": 233815478.0, "step": 2186 }, { "epoch": 4.982896237172178, "grad_norm": 2.53125, "learning_rate": 2.7238302900152327e-06, "loss": 0.5774, "mean_token_accuracy": 0.8843528181314468, "num_tokens": 233922328.0, "step": 2187 }, { "epoch": 4.985176738882554, "grad_norm": 5.40625, "learning_rate": 2.7219543418903115e-06, "loss": 0.6009, "mean_token_accuracy": 0.8763009756803513, "num_tokens": 234028772.0, "step": 2188 }, { "epoch": 4.9874572405929305, "grad_norm": 2.4375, "learning_rate": 2.720078267788392e-06, "loss": 0.5895, "mean_token_accuracy": 0.8788257986307144, "num_tokens": 234135491.0, "step": 2189 }, { "epoch": 4.989737742303307, "grad_norm": 3.921875, "learning_rate": 2.718202068774296e-06, "loss": 0.5923, "mean_token_accuracy": 0.8749925345182419, "num_tokens": 234243399.0, "step": 2190 }, { "epoch": 4.992018244013683, "grad_norm": 2.8125, "learning_rate": 2.7163257459129184e-06, "loss": 0.5639, "mean_token_accuracy": 0.8842321634292603, "num_tokens": 234350519.0, "step": 2191 }, { "epoch": 4.994298745724059, "grad_norm": 3.484375, "learning_rate": 2.7144493002692242e-06, "loss": 0.5813, "mean_token_accuracy": 0.8810799866914749, "num_tokens": 234457691.0, "step": 2192 }, { "epoch": 4.996579247434435, "grad_norm": 3.546875, "learning_rate": 2.7125727329082474e-06, "loss": 0.568, "mean_token_accuracy": 0.8835117220878601, "num_tokens": 234564624.0, "step": 2193 }, { "epoch": 4.9988597491448115, "grad_norm": 4.625, "learning_rate": 2.7106960448950904e-06, "loss": 0.5658, "mean_token_accuracy": 0.8846527189016342, "num_tokens": 234671664.0, "step": 2194 }, { "epoch": 5.0, "grad_norm": 3.8125, "learning_rate": 2.7088192372949267e-06, "loss": 0.5468, "mean_token_accuracy": 0.893031507730484, "num_tokens": 234711160.0, "step": 2195 }, { "epoch": 5.002280501710376, "grad_norm": 2.40625, "learning_rate": 2.7069423111729948e-06, "loss": 0.5723, "mean_token_accuracy": 0.8820315301418304, "num_tokens": 234818290.0, "step": 2196 }, { "epoch": 5.004561003420752, "grad_norm": 2.625, "learning_rate": 2.705065267594602e-06, "loss": 0.578, "mean_token_accuracy": 0.8801742941141129, "num_tokens": 234925842.0, "step": 2197 }, { "epoch": 5.006841505131129, "grad_norm": 3.171875, "learning_rate": 2.703188107625123e-06, "loss": 0.5885, "mean_token_accuracy": 0.8789117336273193, "num_tokens": 235032605.0, "step": 2198 }, { "epoch": 5.009122006841505, "grad_norm": 2.984375, "learning_rate": 2.701310832329996e-06, "loss": 0.5738, "mean_token_accuracy": 0.8840703815221786, "num_tokens": 235139505.0, "step": 2199 }, { "epoch": 5.011402508551882, "grad_norm": 2.84375, "learning_rate": 2.6994334427747276e-06, "loss": 0.5724, "mean_token_accuracy": 0.8826315701007843, "num_tokens": 235246340.0, "step": 2200 }, { "epoch": 5.011402508551882, "eval_loss": 0.5884966850280762, "eval_mean_token_accuracy": 0.8795779687370184, "eval_num_tokens": 235246340.0, "eval_runtime": 58.673, "eval_samples_per_second": 142.911, "eval_steps_per_second": 4.482, "step": 2200 }, { "epoch": 5.013683010262258, "grad_norm": 3.234375, "learning_rate": 2.6975559400248876e-06, "loss": 0.5671, "mean_token_accuracy": 0.8832818120718002, "num_tokens": 235354018.0, "step": 2201 }, { "epoch": 5.015963511972634, "grad_norm": 3.625, "learning_rate": 2.6956783251461093e-06, "loss": 0.5853, "mean_token_accuracy": 0.8836443275213242, "num_tokens": 235461263.0, "step": 2202 }, { "epoch": 5.01824401368301, "grad_norm": 5.09375, "learning_rate": 2.6938005992040923e-06, "loss": 0.5658, "mean_token_accuracy": 0.88588847219944, "num_tokens": 235568391.0, "step": 2203 }, { "epoch": 5.020524515393387, "grad_norm": 2.703125, "learning_rate": 2.6919227632645963e-06, "loss": 0.5662, "mean_token_accuracy": 0.8812144100666046, "num_tokens": 235675302.0, "step": 2204 }, { "epoch": 5.022805017103763, "grad_norm": 5.46875, "learning_rate": 2.690044818393444e-06, "loss": 0.5707, "mean_token_accuracy": 0.8809924572706223, "num_tokens": 235781916.0, "step": 2205 }, { "epoch": 5.025085518814139, "grad_norm": 2.65625, "learning_rate": 2.688166765656523e-06, "loss": 0.5886, "mean_token_accuracy": 0.8800527006387711, "num_tokens": 235889095.0, "step": 2206 }, { "epoch": 5.027366020524515, "grad_norm": 2.65625, "learning_rate": 2.686288606119778e-06, "loss": 0.5671, "mean_token_accuracy": 0.8826051503419876, "num_tokens": 235995817.0, "step": 2207 }, { "epoch": 5.029646522234891, "grad_norm": 2.828125, "learning_rate": 2.6844103408492165e-06, "loss": 0.5672, "mean_token_accuracy": 0.8821920156478882, "num_tokens": 236102620.0, "step": 2208 }, { "epoch": 5.031927023945268, "grad_norm": 4.28125, "learning_rate": 2.682531970910906e-06, "loss": 0.5657, "mean_token_accuracy": 0.882752850651741, "num_tokens": 236210097.0, "step": 2209 }, { "epoch": 5.034207525655645, "grad_norm": 3.609375, "learning_rate": 2.6806534973709723e-06, "loss": 0.5709, "mean_token_accuracy": 0.8835909813642502, "num_tokens": 236317553.0, "step": 2210 }, { "epoch": 5.036488027366021, "grad_norm": 2.9375, "learning_rate": 2.6787749212956023e-06, "loss": 0.5567, "mean_token_accuracy": 0.8868236839771271, "num_tokens": 236424162.0, "step": 2211 }, { "epoch": 5.038768529076397, "grad_norm": 3.90625, "learning_rate": 2.676896243751037e-06, "loss": 0.6022, "mean_token_accuracy": 0.8755950033664703, "num_tokens": 236531359.0, "step": 2212 }, { "epoch": 5.041049030786773, "grad_norm": 2.1875, "learning_rate": 2.6750174658035793e-06, "loss": 0.559, "mean_token_accuracy": 0.8848745375871658, "num_tokens": 236638705.0, "step": 2213 }, { "epoch": 5.043329532497149, "grad_norm": 4.5, "learning_rate": 2.673138588519587e-06, "loss": 0.6018, "mean_token_accuracy": 0.8795148730278015, "num_tokens": 236745788.0, "step": 2214 }, { "epoch": 5.045610034207526, "grad_norm": 3.984375, "learning_rate": 2.671259612965475e-06, "loss": 0.5824, "mean_token_accuracy": 0.8808691650629044, "num_tokens": 236852575.0, "step": 2215 }, { "epoch": 5.047890535917902, "grad_norm": 2.9375, "learning_rate": 2.6693805402077123e-06, "loss": 0.5619, "mean_token_accuracy": 0.8838084042072296, "num_tokens": 236959458.0, "step": 2216 }, { "epoch": 5.050171037628278, "grad_norm": 2.953125, "learning_rate": 2.6675013713128252e-06, "loss": 0.563, "mean_token_accuracy": 0.8833956867456436, "num_tokens": 237067046.0, "step": 2217 }, { "epoch": 5.052451539338654, "grad_norm": 3.0, "learning_rate": 2.665622107347393e-06, "loss": 0.5689, "mean_token_accuracy": 0.8833256661891937, "num_tokens": 237174163.0, "step": 2218 }, { "epoch": 5.05473204104903, "grad_norm": 3.046875, "learning_rate": 2.6637427493780503e-06, "loss": 0.5704, "mean_token_accuracy": 0.8838570863008499, "num_tokens": 237280734.0, "step": 2219 }, { "epoch": 5.0570125427594075, "grad_norm": 5.21875, "learning_rate": 2.6618632984714843e-06, "loss": 0.5725, "mean_token_accuracy": 0.8816571533679962, "num_tokens": 237388128.0, "step": 2220 }, { "epoch": 5.059293044469784, "grad_norm": 2.671875, "learning_rate": 2.6599837556944353e-06, "loss": 0.6045, "mean_token_accuracy": 0.8754912316799164, "num_tokens": 237494752.0, "step": 2221 }, { "epoch": 5.06157354618016, "grad_norm": 4.53125, "learning_rate": 2.658104122113695e-06, "loss": 0.58, "mean_token_accuracy": 0.8795156031847, "num_tokens": 237601646.0, "step": 2222 }, { "epoch": 5.063854047890536, "grad_norm": 2.6875, "learning_rate": 2.6562243987961066e-06, "loss": 0.5483, "mean_token_accuracy": 0.889316976070404, "num_tokens": 237709147.0, "step": 2223 }, { "epoch": 5.066134549600912, "grad_norm": 3.90625, "learning_rate": 2.6543445868085665e-06, "loss": 0.5756, "mean_token_accuracy": 0.8827229887247086, "num_tokens": 237816406.0, "step": 2224 }, { "epoch": 5.068415051311288, "grad_norm": 2.796875, "learning_rate": 2.652464687218018e-06, "loss": 0.5996, "mean_token_accuracy": 0.8769262731075287, "num_tokens": 237923355.0, "step": 2225 }, { "epoch": 5.070695553021665, "grad_norm": 3.921875, "learning_rate": 2.6505847010914575e-06, "loss": 0.5856, "mean_token_accuracy": 0.8756005167961121, "num_tokens": 238030151.0, "step": 2226 }, { "epoch": 5.072976054732041, "grad_norm": 4.34375, "learning_rate": 2.6487046294959275e-06, "loss": 0.5846, "mean_token_accuracy": 0.8792047202587128, "num_tokens": 238137145.0, "step": 2227 }, { "epoch": 5.075256556442417, "grad_norm": 3.09375, "learning_rate": 2.64682447349852e-06, "loss": 0.5668, "mean_token_accuracy": 0.8841764777898788, "num_tokens": 238243562.0, "step": 2228 }, { "epoch": 5.077537058152793, "grad_norm": 3.375, "learning_rate": 2.6449442341663755e-06, "loss": 0.5626, "mean_token_accuracy": 0.8807887881994247, "num_tokens": 238350893.0, "step": 2229 }, { "epoch": 5.07981755986317, "grad_norm": 3.5, "learning_rate": 2.643063912566683e-06, "loss": 0.5751, "mean_token_accuracy": 0.8790174126625061, "num_tokens": 238457625.0, "step": 2230 }, { "epoch": 5.0820980615735465, "grad_norm": 3.390625, "learning_rate": 2.641183509766675e-06, "loss": 0.5586, "mean_token_accuracy": 0.8864348530769348, "num_tokens": 238564821.0, "step": 2231 }, { "epoch": 5.084378563283923, "grad_norm": 3.859375, "learning_rate": 2.639303026833632e-06, "loss": 0.5594, "mean_token_accuracy": 0.8856208771467209, "num_tokens": 238671917.0, "step": 2232 }, { "epoch": 5.086659064994299, "grad_norm": 5.46875, "learning_rate": 2.6374224648348815e-06, "loss": 0.5597, "mean_token_accuracy": 0.8855437189340591, "num_tokens": 238779139.0, "step": 2233 }, { "epoch": 5.088939566704675, "grad_norm": 4.53125, "learning_rate": 2.6355418248377928e-06, "loss": 0.567, "mean_token_accuracy": 0.8826903700828552, "num_tokens": 238886730.0, "step": 2234 }, { "epoch": 5.091220068415051, "grad_norm": 2.640625, "learning_rate": 2.633661107909781e-06, "loss": 0.5568, "mean_token_accuracy": 0.8886370956897736, "num_tokens": 238993760.0, "step": 2235 }, { "epoch": 5.0935005701254275, "grad_norm": 2.515625, "learning_rate": 2.6317803151183053e-06, "loss": 0.5576, "mean_token_accuracy": 0.8850623667240143, "num_tokens": 239100806.0, "step": 2236 }, { "epoch": 5.095781071835804, "grad_norm": 2.625, "learning_rate": 2.629899447530866e-06, "loss": 0.5615, "mean_token_accuracy": 0.8851571381092072, "num_tokens": 239209162.0, "step": 2237 }, { "epoch": 5.09806157354618, "grad_norm": 4.15625, "learning_rate": 2.6280185062150084e-06, "loss": 0.5565, "mean_token_accuracy": 0.8866082727909088, "num_tokens": 239316377.0, "step": 2238 }, { "epoch": 5.100342075256556, "grad_norm": 3.84375, "learning_rate": 2.6261374922383176e-06, "loss": 0.5711, "mean_token_accuracy": 0.8804745078086853, "num_tokens": 239423635.0, "step": 2239 }, { "epoch": 5.102622576966933, "grad_norm": 3.921875, "learning_rate": 2.6242564066684217e-06, "loss": 0.5942, "mean_token_accuracy": 0.8799638897180557, "num_tokens": 239530264.0, "step": 2240 }, { "epoch": 5.104903078677309, "grad_norm": 4.0625, "learning_rate": 2.6223752505729884e-06, "loss": 0.581, "mean_token_accuracy": 0.8811380565166473, "num_tokens": 239637865.0, "step": 2241 }, { "epoch": 5.1071835803876855, "grad_norm": 3.15625, "learning_rate": 2.6204940250197253e-06, "loss": 0.5634, "mean_token_accuracy": 0.8820418566465378, "num_tokens": 239744908.0, "step": 2242 }, { "epoch": 5.109464082098062, "grad_norm": 2.84375, "learning_rate": 2.61861273107638e-06, "loss": 0.564, "mean_token_accuracy": 0.8828686624765396, "num_tokens": 239852149.0, "step": 2243 }, { "epoch": 5.111744583808438, "grad_norm": 6.84375, "learning_rate": 2.6167313698107385e-06, "loss": 0.5729, "mean_token_accuracy": 0.880137637257576, "num_tokens": 239959160.0, "step": 2244 }, { "epoch": 5.114025085518814, "grad_norm": 4.15625, "learning_rate": 2.6148499422906243e-06, "loss": 0.5639, "mean_token_accuracy": 0.8834390491247177, "num_tokens": 240066499.0, "step": 2245 }, { "epoch": 5.11630558722919, "grad_norm": 4.4375, "learning_rate": 2.6129684495839013e-06, "loss": 0.5682, "mean_token_accuracy": 0.8807038068771362, "num_tokens": 240173494.0, "step": 2246 }, { "epoch": 5.1185860889395665, "grad_norm": 4.5, "learning_rate": 2.611086892758467e-06, "loss": 0.5687, "mean_token_accuracy": 0.883940801024437, "num_tokens": 240280234.0, "step": 2247 }, { "epoch": 5.120866590649943, "grad_norm": 3.171875, "learning_rate": 2.6092052728822564e-06, "loss": 0.5826, "mean_token_accuracy": 0.8823877573013306, "num_tokens": 240387350.0, "step": 2248 }, { "epoch": 5.123147092360319, "grad_norm": 3.53125, "learning_rate": 2.607323591023242e-06, "loss": 0.5754, "mean_token_accuracy": 0.8841145634651184, "num_tokens": 240494417.0, "step": 2249 }, { "epoch": 5.125427594070696, "grad_norm": 2.734375, "learning_rate": 2.605441848249428e-06, "loss": 0.5501, "mean_token_accuracy": 0.8849078863859177, "num_tokens": 240601696.0, "step": 2250 }, { "epoch": 5.127708095781072, "grad_norm": 6.34375, "learning_rate": 2.6035600456288573e-06, "loss": 0.5773, "mean_token_accuracy": 0.8804211169481277, "num_tokens": 240708216.0, "step": 2251 }, { "epoch": 5.129988597491448, "grad_norm": 4.34375, "learning_rate": 2.6016781842296044e-06, "loss": 0.5737, "mean_token_accuracy": 0.881677657365799, "num_tokens": 240814705.0, "step": 2252 }, { "epoch": 5.1322690992018245, "grad_norm": 3.296875, "learning_rate": 2.599796265119777e-06, "loss": 0.5827, "mean_token_accuracy": 0.881735309958458, "num_tokens": 240921724.0, "step": 2253 }, { "epoch": 5.134549600912201, "grad_norm": 4.46875, "learning_rate": 2.597914289367516e-06, "loss": 0.5723, "mean_token_accuracy": 0.8814171552658081, "num_tokens": 241029263.0, "step": 2254 }, { "epoch": 5.136830102622577, "grad_norm": 4.28125, "learning_rate": 2.596032258040994e-06, "loss": 0.5699, "mean_token_accuracy": 0.8828539550304413, "num_tokens": 241136198.0, "step": 2255 }, { "epoch": 5.139110604332953, "grad_norm": 3.375, "learning_rate": 2.594150172208417e-06, "loss": 0.5987, "mean_token_accuracy": 0.872844010591507, "num_tokens": 241243108.0, "step": 2256 }, { "epoch": 5.141391106043329, "grad_norm": 3.921875, "learning_rate": 2.59226803293802e-06, "loss": 0.5773, "mean_token_accuracy": 0.881134495139122, "num_tokens": 241350038.0, "step": 2257 }, { "epoch": 5.1436716077537055, "grad_norm": 3.484375, "learning_rate": 2.5903858412980688e-06, "loss": 0.5745, "mean_token_accuracy": 0.882486030459404, "num_tokens": 241456705.0, "step": 2258 }, { "epoch": 5.145952109464082, "grad_norm": 3.21875, "learning_rate": 2.5885035983568584e-06, "loss": 0.5873, "mean_token_accuracy": 0.8809113800525665, "num_tokens": 241564205.0, "step": 2259 }, { "epoch": 5.148232611174459, "grad_norm": 8.3125, "learning_rate": 2.5866213051827148e-06, "loss": 0.5681, "mean_token_accuracy": 0.8836153000593185, "num_tokens": 241671302.0, "step": 2260 }, { "epoch": 5.150513112884835, "grad_norm": 4.96875, "learning_rate": 2.5847389628439905e-06, "loss": 0.5745, "mean_token_accuracy": 0.8828669935464859, "num_tokens": 241778608.0, "step": 2261 }, { "epoch": 5.152793614595211, "grad_norm": 6.4375, "learning_rate": 2.5828565724090672e-06, "loss": 0.5635, "mean_token_accuracy": 0.881557047367096, "num_tokens": 241886293.0, "step": 2262 }, { "epoch": 5.155074116305587, "grad_norm": 4.25, "learning_rate": 2.5809741349463526e-06, "loss": 0.5754, "mean_token_accuracy": 0.882814958691597, "num_tokens": 241993190.0, "step": 2263 }, { "epoch": 5.1573546180159635, "grad_norm": 3.59375, "learning_rate": 2.579091651524282e-06, "loss": 0.5731, "mean_token_accuracy": 0.8815194815397263, "num_tokens": 242100380.0, "step": 2264 }, { "epoch": 5.15963511972634, "grad_norm": 3.484375, "learning_rate": 2.5772091232113176e-06, "loss": 0.5659, "mean_token_accuracy": 0.8794299364089966, "num_tokens": 242207689.0, "step": 2265 }, { "epoch": 5.161915621436716, "grad_norm": 4.28125, "learning_rate": 2.575326551075945e-06, "loss": 0.5774, "mean_token_accuracy": 0.8856062591075897, "num_tokens": 242314652.0, "step": 2266 }, { "epoch": 5.164196123147092, "grad_norm": 3.203125, "learning_rate": 2.5734439361866762e-06, "loss": 0.5641, "mean_token_accuracy": 0.8838386088609695, "num_tokens": 242422494.0, "step": 2267 }, { "epoch": 5.166476624857468, "grad_norm": 5.875, "learning_rate": 2.571561279612047e-06, "loss": 0.5742, "mean_token_accuracy": 0.8841647505760193, "num_tokens": 242529583.0, "step": 2268 }, { "epoch": 5.168757126567845, "grad_norm": 5.1875, "learning_rate": 2.5696785824206177e-06, "loss": 0.5852, "mean_token_accuracy": 0.8775971680879593, "num_tokens": 242636422.0, "step": 2269 }, { "epoch": 5.1710376282782216, "grad_norm": 5.15625, "learning_rate": 2.5677958456809703e-06, "loss": 0.5768, "mean_token_accuracy": 0.8794868886470795, "num_tokens": 242742862.0, "step": 2270 }, { "epoch": 5.173318129988598, "grad_norm": 3.59375, "learning_rate": 2.5659130704617092e-06, "loss": 0.579, "mean_token_accuracy": 0.8852169960737228, "num_tokens": 242850283.0, "step": 2271 }, { "epoch": 5.175598631698974, "grad_norm": 2.703125, "learning_rate": 2.5640302578314614e-06, "loss": 0.5804, "mean_token_accuracy": 0.8777539879083633, "num_tokens": 242957389.0, "step": 2272 }, { "epoch": 5.17787913340935, "grad_norm": 2.546875, "learning_rate": 2.562147408858876e-06, "loss": 0.5718, "mean_token_accuracy": 0.8816378861665726, "num_tokens": 243064479.0, "step": 2273 }, { "epoch": 5.180159635119726, "grad_norm": 2.796875, "learning_rate": 2.5602645246126207e-06, "loss": 0.5922, "mean_token_accuracy": 0.877925843000412, "num_tokens": 243171634.0, "step": 2274 }, { "epoch": 5.1824401368301025, "grad_norm": 3.125, "learning_rate": 2.5583816061613847e-06, "loss": 0.566, "mean_token_accuracy": 0.885333925485611, "num_tokens": 243278723.0, "step": 2275 }, { "epoch": 5.184720638540479, "grad_norm": 2.296875, "learning_rate": 2.5564986545738767e-06, "loss": 0.5537, "mean_token_accuracy": 0.8853138238191605, "num_tokens": 243386191.0, "step": 2276 }, { "epoch": 5.187001140250855, "grad_norm": 2.6875, "learning_rate": 2.554615670918823e-06, "loss": 0.5912, "mean_token_accuracy": 0.8801163583993912, "num_tokens": 243492478.0, "step": 2277 }, { "epoch": 5.189281641961231, "grad_norm": 3.0, "learning_rate": 2.552732656264969e-06, "loss": 0.5707, "mean_token_accuracy": 0.8835473358631134, "num_tokens": 243599878.0, "step": 2278 }, { "epoch": 5.191562143671608, "grad_norm": 3.84375, "learning_rate": 2.5508496116810766e-06, "loss": 0.6136, "mean_token_accuracy": 0.8747565001249313, "num_tokens": 243706837.0, "step": 2279 }, { "epoch": 5.193842645381984, "grad_norm": 3.1875, "learning_rate": 2.548966538235927e-06, "loss": 0.5746, "mean_token_accuracy": 0.8776230216026306, "num_tokens": 243813746.0, "step": 2280 }, { "epoch": 5.196123147092361, "grad_norm": 4.34375, "learning_rate": 2.547083436998316e-06, "loss": 0.5647, "mean_token_accuracy": 0.8837112933397293, "num_tokens": 243920433.0, "step": 2281 }, { "epoch": 5.198403648802737, "grad_norm": 3.234375, "learning_rate": 2.5452003090370543e-06, "loss": 0.5639, "mean_token_accuracy": 0.8835209310054779, "num_tokens": 244027394.0, "step": 2282 }, { "epoch": 5.200684150513113, "grad_norm": 3.4375, "learning_rate": 2.5433171554209694e-06, "loss": 0.5585, "mean_token_accuracy": 0.8822397440671921, "num_tokens": 244134310.0, "step": 2283 }, { "epoch": 5.202964652223489, "grad_norm": 3.1875, "learning_rate": 2.5414339772189045e-06, "loss": 0.5558, "mean_token_accuracy": 0.8861335813999176, "num_tokens": 244241565.0, "step": 2284 }, { "epoch": 5.205245153933865, "grad_norm": 3.90625, "learning_rate": 2.5395507754997135e-06, "loss": 0.5562, "mean_token_accuracy": 0.8825538158416748, "num_tokens": 244349038.0, "step": 2285 }, { "epoch": 5.2075256556442415, "grad_norm": 2.375, "learning_rate": 2.5376675513322665e-06, "loss": 0.5813, "mean_token_accuracy": 0.8824332058429718, "num_tokens": 244455939.0, "step": 2286 }, { "epoch": 5.209806157354618, "grad_norm": 3.265625, "learning_rate": 2.535784305785443e-06, "loss": 0.5862, "mean_token_accuracy": 0.8791725635528564, "num_tokens": 244563202.0, "step": 2287 }, { "epoch": 5.212086659064994, "grad_norm": 2.71875, "learning_rate": 2.5339010399281394e-06, "loss": 0.5615, "mean_token_accuracy": 0.8839509189128876, "num_tokens": 244670656.0, "step": 2288 }, { "epoch": 5.214367160775371, "grad_norm": 2.859375, "learning_rate": 2.53201775482926e-06, "loss": 0.5729, "mean_token_accuracy": 0.8803385496139526, "num_tokens": 244777602.0, "step": 2289 }, { "epoch": 5.216647662485747, "grad_norm": 3.28125, "learning_rate": 2.530134451557722e-06, "loss": 0.5667, "mean_token_accuracy": 0.8833064287900925, "num_tokens": 244884757.0, "step": 2290 }, { "epoch": 5.218928164196123, "grad_norm": 2.859375, "learning_rate": 2.52825113118245e-06, "loss": 0.5753, "mean_token_accuracy": 0.8829501569271088, "num_tokens": 244992076.0, "step": 2291 }, { "epoch": 5.2212086659065, "grad_norm": 4.15625, "learning_rate": 2.5263677947723813e-06, "loss": 0.5773, "mean_token_accuracy": 0.8804136514663696, "num_tokens": 245099111.0, "step": 2292 }, { "epoch": 5.223489167616876, "grad_norm": 3.484375, "learning_rate": 2.5244844433964615e-06, "loss": 0.5481, "mean_token_accuracy": 0.8848851323127747, "num_tokens": 245206635.0, "step": 2293 }, { "epoch": 5.225769669327252, "grad_norm": 2.484375, "learning_rate": 2.522601078123645e-06, "loss": 0.5586, "mean_token_accuracy": 0.8856381624937057, "num_tokens": 245313995.0, "step": 2294 }, { "epoch": 5.228050171037628, "grad_norm": 2.84375, "learning_rate": 2.5207177000228916e-06, "loss": 0.5651, "mean_token_accuracy": 0.8863477110862732, "num_tokens": 245420590.0, "step": 2295 }, { "epoch": 5.230330672748004, "grad_norm": 2.90625, "learning_rate": 2.5188343101631717e-06, "loss": 0.5998, "mean_token_accuracy": 0.8750515878200531, "num_tokens": 245527620.0, "step": 2296 }, { "epoch": 5.2326111744583805, "grad_norm": 4.84375, "learning_rate": 2.516950909613462e-06, "loss": 0.5574, "mean_token_accuracy": 0.8836745172739029, "num_tokens": 245634856.0, "step": 2297 }, { "epoch": 5.234891676168757, "grad_norm": 2.65625, "learning_rate": 2.5150674994427427e-06, "loss": 0.5914, "mean_token_accuracy": 0.8803335726261139, "num_tokens": 245741637.0, "step": 2298 }, { "epoch": 5.237172177879134, "grad_norm": 6.40625, "learning_rate": 2.5131840807200015e-06, "loss": 0.5603, "mean_token_accuracy": 0.8841415345668793, "num_tokens": 245850208.0, "step": 2299 }, { "epoch": 5.23945267958951, "grad_norm": 2.796875, "learning_rate": 2.511300654514231e-06, "loss": 0.6017, "mean_token_accuracy": 0.8761951625347137, "num_tokens": 245956827.0, "step": 2300 }, { "epoch": 5.241733181299886, "grad_norm": 2.578125, "learning_rate": 2.5094172218944276e-06, "loss": 0.5628, "mean_token_accuracy": 0.8835689127445221, "num_tokens": 246064233.0, "step": 2301 }, { "epoch": 5.244013683010262, "grad_norm": 2.5625, "learning_rate": 2.5075337839295903e-06, "loss": 0.5553, "mean_token_accuracy": 0.881672739982605, "num_tokens": 246171103.0, "step": 2302 }, { "epoch": 5.246294184720639, "grad_norm": 6.4375, "learning_rate": 2.5056503416887222e-06, "loss": 0.5695, "mean_token_accuracy": 0.8831684589385986, "num_tokens": 246278163.0, "step": 2303 }, { "epoch": 5.248574686431015, "grad_norm": 3.375, "learning_rate": 2.5037668962408295e-06, "loss": 0.5733, "mean_token_accuracy": 0.8839241862297058, "num_tokens": 246385460.0, "step": 2304 }, { "epoch": 5.250855188141391, "grad_norm": 2.828125, "learning_rate": 2.5018834486549198e-06, "loss": 0.5729, "mean_token_accuracy": 0.8784928619861603, "num_tokens": 246492696.0, "step": 2305 }, { "epoch": 5.253135689851767, "grad_norm": 3.1875, "learning_rate": 2.5e-06, "loss": 0.5691, "mean_token_accuracy": 0.884750634431839, "num_tokens": 246599773.0, "step": 2306 }, { "epoch": 5.255416191562143, "grad_norm": 3.6875, "learning_rate": 2.4981165513450807e-06, "loss": 0.5908, "mean_token_accuracy": 0.8797119110822678, "num_tokens": 246706234.0, "step": 2307 }, { "epoch": 5.2576966932725195, "grad_norm": 4.09375, "learning_rate": 2.4962331037591705e-06, "loss": 0.5682, "mean_token_accuracy": 0.8851300626993179, "num_tokens": 246813408.0, "step": 2308 }, { "epoch": 5.259977194982897, "grad_norm": 4.53125, "learning_rate": 2.494349658311279e-06, "loss": 0.5752, "mean_token_accuracy": 0.8815591186285019, "num_tokens": 246920031.0, "step": 2309 }, { "epoch": 5.262257696693273, "grad_norm": 5.46875, "learning_rate": 2.492466216070411e-06, "loss": 0.5806, "mean_token_accuracy": 0.8796799033880234, "num_tokens": 247027919.0, "step": 2310 }, { "epoch": 5.264538198403649, "grad_norm": 3.390625, "learning_rate": 2.4905827781055733e-06, "loss": 0.5827, "mean_token_accuracy": 0.8780756443738937, "num_tokens": 247134526.0, "step": 2311 }, { "epoch": 5.266818700114025, "grad_norm": 2.765625, "learning_rate": 2.4886993454857696e-06, "loss": 0.5598, "mean_token_accuracy": 0.8812786191701889, "num_tokens": 247242308.0, "step": 2312 }, { "epoch": 5.269099201824401, "grad_norm": 2.875, "learning_rate": 2.486815919279999e-06, "loss": 0.5608, "mean_token_accuracy": 0.8846468031406403, "num_tokens": 247349006.0, "step": 2313 }, { "epoch": 5.271379703534778, "grad_norm": 4.03125, "learning_rate": 2.4849325005572573e-06, "loss": 0.5587, "mean_token_accuracy": 0.8841832727193832, "num_tokens": 247455756.0, "step": 2314 }, { "epoch": 5.273660205245154, "grad_norm": 3.234375, "learning_rate": 2.483049090386539e-06, "loss": 0.5726, "mean_token_accuracy": 0.8810638189315796, "num_tokens": 247562448.0, "step": 2315 }, { "epoch": 5.27594070695553, "grad_norm": 5.25, "learning_rate": 2.4811656898368287e-06, "loss": 0.5921, "mean_token_accuracy": 0.87621209025383, "num_tokens": 247669493.0, "step": 2316 }, { "epoch": 5.278221208665906, "grad_norm": 2.640625, "learning_rate": 2.4792822999771092e-06, "loss": 0.5819, "mean_token_accuracy": 0.8810609132051468, "num_tokens": 247776380.0, "step": 2317 }, { "epoch": 5.280501710376283, "grad_norm": 2.9375, "learning_rate": 2.477398921876356e-06, "loss": 0.5727, "mean_token_accuracy": 0.8778067380189896, "num_tokens": 247883246.0, "step": 2318 }, { "epoch": 5.282782212086659, "grad_norm": 4.5, "learning_rate": 2.475515556603539e-06, "loss": 0.5648, "mean_token_accuracy": 0.882491260766983, "num_tokens": 247990244.0, "step": 2319 }, { "epoch": 5.285062713797036, "grad_norm": 5.875, "learning_rate": 2.47363220522762e-06, "loss": 0.5675, "mean_token_accuracy": 0.8826608210802078, "num_tokens": 248097490.0, "step": 2320 }, { "epoch": 5.287343215507412, "grad_norm": 4.40625, "learning_rate": 2.4717488688175513e-06, "loss": 0.576, "mean_token_accuracy": 0.8805565237998962, "num_tokens": 248204076.0, "step": 2321 }, { "epoch": 5.289623717217788, "grad_norm": 2.75, "learning_rate": 2.469865548442279e-06, "loss": 0.5795, "mean_token_accuracy": 0.8818432092666626, "num_tokens": 248311212.0, "step": 2322 }, { "epoch": 5.291904218928164, "grad_norm": 3.453125, "learning_rate": 2.4679822451707404e-06, "loss": 0.5693, "mean_token_accuracy": 0.8799884766340256, "num_tokens": 248418165.0, "step": 2323 }, { "epoch": 5.29418472063854, "grad_norm": 3.015625, "learning_rate": 2.4660989600718606e-06, "loss": 0.5607, "mean_token_accuracy": 0.8832896500825882, "num_tokens": 248525308.0, "step": 2324 }, { "epoch": 5.296465222348917, "grad_norm": 3.484375, "learning_rate": 2.4642156942145577e-06, "loss": 0.5588, "mean_token_accuracy": 0.8862069100141525, "num_tokens": 248632300.0, "step": 2325 }, { "epoch": 5.298745724059293, "grad_norm": 3.28125, "learning_rate": 2.4623324486677352e-06, "loss": 0.5821, "mean_token_accuracy": 0.8789513111114502, "num_tokens": 248739272.0, "step": 2326 }, { "epoch": 5.301026225769669, "grad_norm": 2.734375, "learning_rate": 2.4604492245002873e-06, "loss": 0.5729, "mean_token_accuracy": 0.881064772605896, "num_tokens": 248846325.0, "step": 2327 }, { "epoch": 5.303306727480045, "grad_norm": 2.453125, "learning_rate": 2.4585660227810963e-06, "loss": 0.5797, "mean_token_accuracy": 0.8820571899414062, "num_tokens": 248953637.0, "step": 2328 }, { "epoch": 5.305587229190422, "grad_norm": 3.21875, "learning_rate": 2.4566828445790306e-06, "loss": 0.5838, "mean_token_accuracy": 0.8833505213260651, "num_tokens": 249060027.0, "step": 2329 }, { "epoch": 5.307867730900798, "grad_norm": 2.984375, "learning_rate": 2.454799690962946e-06, "loss": 0.5628, "mean_token_accuracy": 0.8810784071683884, "num_tokens": 249166827.0, "step": 2330 }, { "epoch": 5.310148232611175, "grad_norm": 2.96875, "learning_rate": 2.4529165630016855e-06, "loss": 0.5819, "mean_token_accuracy": 0.8803735673427582, "num_tokens": 249273474.0, "step": 2331 }, { "epoch": 5.312428734321551, "grad_norm": 3.5625, "learning_rate": 2.4510334617640733e-06, "loss": 0.581, "mean_token_accuracy": 0.8823344260454178, "num_tokens": 249380314.0, "step": 2332 }, { "epoch": 5.314709236031927, "grad_norm": 3.046875, "learning_rate": 2.4491503883189242e-06, "loss": 0.5597, "mean_token_accuracy": 0.8847284018993378, "num_tokens": 249487519.0, "step": 2333 }, { "epoch": 5.316989737742303, "grad_norm": 4.625, "learning_rate": 2.447267343735032e-06, "loss": 0.5818, "mean_token_accuracy": 0.8809306025505066, "num_tokens": 249594226.0, "step": 2334 }, { "epoch": 5.319270239452679, "grad_norm": 2.890625, "learning_rate": 2.4453843290811772e-06, "loss": 0.5786, "mean_token_accuracy": 0.8837304264307022, "num_tokens": 249701458.0, "step": 2335 }, { "epoch": 5.321550741163056, "grad_norm": 3.8125, "learning_rate": 2.4435013454261246e-06, "loss": 0.5898, "mean_token_accuracy": 0.8787836730480194, "num_tokens": 249807967.0, "step": 2336 }, { "epoch": 5.323831242873432, "grad_norm": 3.625, "learning_rate": 2.4416183938386157e-06, "loss": 0.5645, "mean_token_accuracy": 0.8804430663585663, "num_tokens": 249915439.0, "step": 2337 }, { "epoch": 5.326111744583809, "grad_norm": 2.875, "learning_rate": 2.4397354753873797e-06, "loss": 0.5745, "mean_token_accuracy": 0.8802332282066345, "num_tokens": 250022617.0, "step": 2338 }, { "epoch": 5.328392246294185, "grad_norm": 4.28125, "learning_rate": 2.4378525911411246e-06, "loss": 0.5686, "mean_token_accuracy": 0.8810295462608337, "num_tokens": 250129347.0, "step": 2339 }, { "epoch": 5.330672748004561, "grad_norm": 4.34375, "learning_rate": 2.435969742168539e-06, "loss": 0.5752, "mean_token_accuracy": 0.8794845640659332, "num_tokens": 250236273.0, "step": 2340 }, { "epoch": 5.3329532497149374, "grad_norm": 3.75, "learning_rate": 2.4340869295382924e-06, "loss": 0.5837, "mean_token_accuracy": 0.8810597956180573, "num_tokens": 250343235.0, "step": 2341 }, { "epoch": 5.335233751425314, "grad_norm": 5.25, "learning_rate": 2.432204154319031e-06, "loss": 0.5673, "mean_token_accuracy": 0.8840866684913635, "num_tokens": 250450686.0, "step": 2342 }, { "epoch": 5.33751425313569, "grad_norm": 3.0, "learning_rate": 2.4303214175793827e-06, "loss": 0.599, "mean_token_accuracy": 0.8780745565891266, "num_tokens": 250557785.0, "step": 2343 }, { "epoch": 5.339794754846066, "grad_norm": 5.71875, "learning_rate": 2.4284387203879536e-06, "loss": 0.5853, "mean_token_accuracy": 0.8825893849134445, "num_tokens": 250665664.0, "step": 2344 }, { "epoch": 5.342075256556442, "grad_norm": 5.09375, "learning_rate": 2.426556063813324e-06, "loss": 0.5485, "mean_token_accuracy": 0.8859640210866928, "num_tokens": 250773371.0, "step": 2345 }, { "epoch": 5.344355758266818, "grad_norm": 5.15625, "learning_rate": 2.4246734489240554e-06, "loss": 0.5664, "mean_token_accuracy": 0.883133664727211, "num_tokens": 250880142.0, "step": 2346 }, { "epoch": 5.346636259977195, "grad_norm": 5.15625, "learning_rate": 2.4227908767886837e-06, "loss": 0.5622, "mean_token_accuracy": 0.8815203011035919, "num_tokens": 250987935.0, "step": 2347 }, { "epoch": 5.348916761687571, "grad_norm": 4.21875, "learning_rate": 2.420908348475719e-06, "loss": 0.5691, "mean_token_accuracy": 0.8813404738903046, "num_tokens": 251094697.0, "step": 2348 }, { "epoch": 5.351197263397948, "grad_norm": 4.9375, "learning_rate": 2.4190258650536483e-06, "loss": 0.5722, "mean_token_accuracy": 0.883138045668602, "num_tokens": 251201631.0, "step": 2349 }, { "epoch": 5.353477765108324, "grad_norm": 2.421875, "learning_rate": 2.417143427590933e-06, "loss": 0.5414, "mean_token_accuracy": 0.8879837691783905, "num_tokens": 251309130.0, "step": 2350 }, { "epoch": 5.3557582668187, "grad_norm": 3.40625, "learning_rate": 2.4152610371560095e-06, "loss": 0.567, "mean_token_accuracy": 0.886492133140564, "num_tokens": 251416268.0, "step": 2351 }, { "epoch": 5.3580387685290765, "grad_norm": 7.125, "learning_rate": 2.413378694817286e-06, "loss": 0.5553, "mean_token_accuracy": 0.8863291144371033, "num_tokens": 251523690.0, "step": 2352 }, { "epoch": 5.360319270239453, "grad_norm": 6.96875, "learning_rate": 2.411496401643142e-06, "loss": 0.59, "mean_token_accuracy": 0.8787893354892731, "num_tokens": 251630852.0, "step": 2353 }, { "epoch": 5.362599771949829, "grad_norm": 4.25, "learning_rate": 2.409614158701932e-06, "loss": 0.5525, "mean_token_accuracy": 0.888729602098465, "num_tokens": 251738309.0, "step": 2354 }, { "epoch": 5.364880273660205, "grad_norm": 5.0, "learning_rate": 2.407731967061981e-06, "loss": 0.559, "mean_token_accuracy": 0.8860864639282227, "num_tokens": 251845628.0, "step": 2355 }, { "epoch": 5.367160775370581, "grad_norm": 5.15625, "learning_rate": 2.4058498277915835e-06, "loss": 0.5791, "mean_token_accuracy": 0.8805464655160904, "num_tokens": 251951993.0, "step": 2356 }, { "epoch": 5.369441277080957, "grad_norm": 3.453125, "learning_rate": 2.4039677419590064e-06, "loss": 0.5592, "mean_token_accuracy": 0.8828859180212021, "num_tokens": 252059284.0, "step": 2357 }, { "epoch": 5.3717217787913345, "grad_norm": 3.46875, "learning_rate": 2.4020857106324853e-06, "loss": 0.6042, "mean_token_accuracy": 0.872930184006691, "num_tokens": 252166320.0, "step": 2358 }, { "epoch": 5.374002280501711, "grad_norm": 4.375, "learning_rate": 2.4002037348802245e-06, "loss": 0.5629, "mean_token_accuracy": 0.8799397349357605, "num_tokens": 252273488.0, "step": 2359 }, { "epoch": 5.376282782212087, "grad_norm": 6.0625, "learning_rate": 2.3983218157703964e-06, "loss": 0.6086, "mean_token_accuracy": 0.8736053854227066, "num_tokens": 252380458.0, "step": 2360 }, { "epoch": 5.378563283922463, "grad_norm": 5.28125, "learning_rate": 2.3964399543711427e-06, "loss": 0.59, "mean_token_accuracy": 0.8807500749826431, "num_tokens": 252487767.0, "step": 2361 }, { "epoch": 5.380843785632839, "grad_norm": 5.84375, "learning_rate": 2.394558151750572e-06, "loss": 0.5971, "mean_token_accuracy": 0.8786357641220093, "num_tokens": 252595094.0, "step": 2362 }, { "epoch": 5.3831242873432155, "grad_norm": 5.84375, "learning_rate": 2.3926764089767594e-06, "loss": 0.571, "mean_token_accuracy": 0.8844779282808304, "num_tokens": 252702007.0, "step": 2363 }, { "epoch": 5.385404789053592, "grad_norm": 6.125, "learning_rate": 2.3907947271177444e-06, "loss": 0.5826, "mean_token_accuracy": 0.8814315497875214, "num_tokens": 252808779.0, "step": 2364 }, { "epoch": 5.387685290763968, "grad_norm": 2.703125, "learning_rate": 2.388913107241534e-06, "loss": 0.5825, "mean_token_accuracy": 0.8800296932458878, "num_tokens": 252915277.0, "step": 2365 }, { "epoch": 5.389965792474344, "grad_norm": 3.375, "learning_rate": 2.3870315504160995e-06, "loss": 0.5783, "mean_token_accuracy": 0.8823367357254028, "num_tokens": 253022601.0, "step": 2366 }, { "epoch": 5.39224629418472, "grad_norm": 2.703125, "learning_rate": 2.3851500577093757e-06, "loss": 0.5541, "mean_token_accuracy": 0.8841412514448166, "num_tokens": 253129731.0, "step": 2367 }, { "epoch": 5.394526795895097, "grad_norm": 4.8125, "learning_rate": 2.3832686301892628e-06, "loss": 0.5722, "mean_token_accuracy": 0.8834680169820786, "num_tokens": 253236662.0, "step": 2368 }, { "epoch": 5.3968072976054735, "grad_norm": 4.3125, "learning_rate": 2.381387268923621e-06, "loss": 0.594, "mean_token_accuracy": 0.8795076310634613, "num_tokens": 253343900.0, "step": 2369 }, { "epoch": 5.39908779931585, "grad_norm": 3.5625, "learning_rate": 2.3795059749802756e-06, "loss": 0.5778, "mean_token_accuracy": 0.8820982426404953, "num_tokens": 253450924.0, "step": 2370 }, { "epoch": 5.401368301026226, "grad_norm": 3.515625, "learning_rate": 2.377624749427012e-06, "loss": 0.5701, "mean_token_accuracy": 0.884630486369133, "num_tokens": 253558210.0, "step": 2371 }, { "epoch": 5.403648802736602, "grad_norm": 4.84375, "learning_rate": 2.3757435933315787e-06, "loss": 0.5739, "mean_token_accuracy": 0.878990963101387, "num_tokens": 253665052.0, "step": 2372 }, { "epoch": 5.405929304446978, "grad_norm": 5.03125, "learning_rate": 2.3738625077616837e-06, "loss": 0.5629, "mean_token_accuracy": 0.8815456032752991, "num_tokens": 253772336.0, "step": 2373 }, { "epoch": 5.4082098061573545, "grad_norm": 3.046875, "learning_rate": 2.371981493784993e-06, "loss": 0.5833, "mean_token_accuracy": 0.8819355368614197, "num_tokens": 253878690.0, "step": 2374 }, { "epoch": 5.410490307867731, "grad_norm": 2.953125, "learning_rate": 2.370100552469135e-06, "loss": 0.5625, "mean_token_accuracy": 0.8823709636926651, "num_tokens": 253985411.0, "step": 2375 }, { "epoch": 5.412770809578107, "grad_norm": 4.6875, "learning_rate": 2.3682196848816955e-06, "loss": 0.5734, "mean_token_accuracy": 0.881668359041214, "num_tokens": 254092624.0, "step": 2376 }, { "epoch": 5.415051311288483, "grad_norm": 2.765625, "learning_rate": 2.3663388920902198e-06, "loss": 0.5749, "mean_token_accuracy": 0.8824621737003326, "num_tokens": 254199578.0, "step": 2377 }, { "epoch": 5.41733181299886, "grad_norm": 4.6875, "learning_rate": 2.3644581751622076e-06, "loss": 0.5751, "mean_token_accuracy": 0.8811022043228149, "num_tokens": 254306615.0, "step": 2378 }, { "epoch": 5.419612314709236, "grad_norm": 7.15625, "learning_rate": 2.3625775351651193e-06, "loss": 0.5955, "mean_token_accuracy": 0.8790160417556763, "num_tokens": 254413447.0, "step": 2379 }, { "epoch": 5.4218928164196125, "grad_norm": 3.796875, "learning_rate": 2.3606969731663683e-06, "loss": 0.5955, "mean_token_accuracy": 0.8772003203630447, "num_tokens": 254520623.0, "step": 2380 }, { "epoch": 5.424173318129989, "grad_norm": 2.859375, "learning_rate": 2.358816490233326e-06, "loss": 0.5646, "mean_token_accuracy": 0.8835051357746124, "num_tokens": 254627369.0, "step": 2381 }, { "epoch": 5.426453819840365, "grad_norm": 3.40625, "learning_rate": 2.356936087433318e-06, "loss": 0.5711, "mean_token_accuracy": 0.8843741118907928, "num_tokens": 254734806.0, "step": 2382 }, { "epoch": 5.428734321550741, "grad_norm": 3.203125, "learning_rate": 2.3550557658336245e-06, "loss": 0.5727, "mean_token_accuracy": 0.881999284029007, "num_tokens": 254841536.0, "step": 2383 }, { "epoch": 5.431014823261117, "grad_norm": 2.84375, "learning_rate": 2.3531755265014818e-06, "loss": 0.5481, "mean_token_accuracy": 0.8869676440954208, "num_tokens": 254949872.0, "step": 2384 }, { "epoch": 5.4332953249714935, "grad_norm": 3.234375, "learning_rate": 2.3512953705040737e-06, "loss": 0.525, "mean_token_accuracy": 0.894694447517395, "num_tokens": 255057725.0, "step": 2385 }, { "epoch": 5.43557582668187, "grad_norm": 4.90625, "learning_rate": 2.3494152989085433e-06, "loss": 0.5388, "mean_token_accuracy": 0.888728454709053, "num_tokens": 255165920.0, "step": 2386 }, { "epoch": 5.437856328392247, "grad_norm": 4.28125, "learning_rate": 2.3475353127819827e-06, "loss": 0.5708, "mean_token_accuracy": 0.8810894787311554, "num_tokens": 255273480.0, "step": 2387 }, { "epoch": 5.440136830102623, "grad_norm": 3.328125, "learning_rate": 2.345655413191434e-06, "loss": 0.5962, "mean_token_accuracy": 0.880548432469368, "num_tokens": 255380729.0, "step": 2388 }, { "epoch": 5.442417331812999, "grad_norm": 3.296875, "learning_rate": 2.3437756012038933e-06, "loss": 0.5678, "mean_token_accuracy": 0.882551446557045, "num_tokens": 255488672.0, "step": 2389 }, { "epoch": 5.444697833523375, "grad_norm": 2.609375, "learning_rate": 2.341895877886306e-06, "loss": 0.571, "mean_token_accuracy": 0.8806143552064896, "num_tokens": 255595607.0, "step": 2390 }, { "epoch": 5.4469783352337515, "grad_norm": 2.671875, "learning_rate": 2.3400162443055655e-06, "loss": 0.5729, "mean_token_accuracy": 0.8818208873271942, "num_tokens": 255702745.0, "step": 2391 }, { "epoch": 5.449258836944128, "grad_norm": 2.625, "learning_rate": 2.338136701528516e-06, "loss": 0.5701, "mean_token_accuracy": 0.8838256001472473, "num_tokens": 255810202.0, "step": 2392 }, { "epoch": 5.451539338654504, "grad_norm": 3.859375, "learning_rate": 2.33625725062195e-06, "loss": 0.6016, "mean_token_accuracy": 0.8813562989234924, "num_tokens": 255916916.0, "step": 2393 }, { "epoch": 5.45381984036488, "grad_norm": 2.765625, "learning_rate": 2.3343778926526074e-06, "loss": 0.5654, "mean_token_accuracy": 0.8832430243492126, "num_tokens": 256023471.0, "step": 2394 }, { "epoch": 5.456100342075256, "grad_norm": 3.125, "learning_rate": 2.332498628687176e-06, "loss": 0.5833, "mean_token_accuracy": 0.8809223771095276, "num_tokens": 256130280.0, "step": 2395 }, { "epoch": 5.4583808437856325, "grad_norm": 3.71875, "learning_rate": 2.330619459792289e-06, "loss": 0.5938, "mean_token_accuracy": 0.8768937885761261, "num_tokens": 256236939.0, "step": 2396 }, { "epoch": 5.460661345496009, "grad_norm": 2.703125, "learning_rate": 2.328740387034526e-06, "loss": 0.587, "mean_token_accuracy": 0.8795687556266785, "num_tokens": 256344069.0, "step": 2397 }, { "epoch": 5.462941847206386, "grad_norm": 2.609375, "learning_rate": 2.326861411480414e-06, "loss": 0.545, "mean_token_accuracy": 0.8894121795892715, "num_tokens": 256450947.0, "step": 2398 }, { "epoch": 5.465222348916762, "grad_norm": 2.65625, "learning_rate": 2.324982534196421e-06, "loss": 0.5617, "mean_token_accuracy": 0.8822938203811646, "num_tokens": 256558445.0, "step": 2399 }, { "epoch": 5.467502850627138, "grad_norm": 3.3125, "learning_rate": 2.3231037562489636e-06, "loss": 0.5599, "mean_token_accuracy": 0.8841269761323929, "num_tokens": 256665737.0, "step": 2400 }, { "epoch": 5.469783352337514, "grad_norm": 2.90625, "learning_rate": 2.321225078704399e-06, "loss": 0.5657, "mean_token_accuracy": 0.8798850178718567, "num_tokens": 256772658.0, "step": 2401 }, { "epoch": 5.4720638540478905, "grad_norm": 2.9375, "learning_rate": 2.319346502629028e-06, "loss": 0.5642, "mean_token_accuracy": 0.883221909403801, "num_tokens": 256879693.0, "step": 2402 }, { "epoch": 5.474344355758267, "grad_norm": 3.78125, "learning_rate": 2.3174680290890945e-06, "loss": 0.596, "mean_token_accuracy": 0.882306694984436, "num_tokens": 256986979.0, "step": 2403 }, { "epoch": 5.476624857468643, "grad_norm": 2.890625, "learning_rate": 2.315589659150784e-06, "loss": 0.5844, "mean_token_accuracy": 0.8786382675170898, "num_tokens": 257093930.0, "step": 2404 }, { "epoch": 5.478905359179019, "grad_norm": 4.03125, "learning_rate": 2.3137113938802224e-06, "loss": 0.5936, "mean_token_accuracy": 0.8781066983938217, "num_tokens": 257201360.0, "step": 2405 }, { "epoch": 5.481185860889395, "grad_norm": 3.96875, "learning_rate": 2.311833234343478e-06, "loss": 0.5852, "mean_token_accuracy": 0.8780955076217651, "num_tokens": 257308294.0, "step": 2406 }, { "epoch": 5.483466362599772, "grad_norm": 2.71875, "learning_rate": 2.3099551816065563e-06, "loss": 0.6028, "mean_token_accuracy": 0.8727833181619644, "num_tokens": 257414698.0, "step": 2407 }, { "epoch": 5.485746864310149, "grad_norm": 3.125, "learning_rate": 2.3080772367354046e-06, "loss": 0.5579, "mean_token_accuracy": 0.888285905122757, "num_tokens": 257521674.0, "step": 2408 }, { "epoch": 5.488027366020525, "grad_norm": 3.9375, "learning_rate": 2.3061994007959086e-06, "loss": 0.5819, "mean_token_accuracy": 0.8786954879760742, "num_tokens": 257628383.0, "step": 2409 }, { "epoch": 5.490307867730901, "grad_norm": 2.515625, "learning_rate": 2.304321674853891e-06, "loss": 0.5604, "mean_token_accuracy": 0.8847887217998505, "num_tokens": 257735646.0, "step": 2410 }, { "epoch": 5.492588369441277, "grad_norm": 4.65625, "learning_rate": 2.3024440599751132e-06, "loss": 0.5828, "mean_token_accuracy": 0.8823476582765579, "num_tokens": 257843020.0, "step": 2411 }, { "epoch": 5.494868871151653, "grad_norm": 2.984375, "learning_rate": 2.3005665572252732e-06, "loss": 0.6067, "mean_token_accuracy": 0.8751996457576752, "num_tokens": 257949651.0, "step": 2412 }, { "epoch": 5.4971493728620295, "grad_norm": 3.390625, "learning_rate": 2.2986891676700042e-06, "loss": 0.5749, "mean_token_accuracy": 0.8818869143724442, "num_tokens": 258056645.0, "step": 2413 }, { "epoch": 5.499429874572406, "grad_norm": 3.921875, "learning_rate": 2.296811892374878e-06, "loss": 0.563, "mean_token_accuracy": 0.8861507475376129, "num_tokens": 258163786.0, "step": 2414 }, { "epoch": 5.501710376282782, "grad_norm": 2.796875, "learning_rate": 2.294934732405398e-06, "loss": 0.573, "mean_token_accuracy": 0.8789652734994888, "num_tokens": 258271199.0, "step": 2415 }, { "epoch": 5.503990877993158, "grad_norm": 3.40625, "learning_rate": 2.293057688827007e-06, "loss": 0.5625, "mean_token_accuracy": 0.8831315487623215, "num_tokens": 258378688.0, "step": 2416 }, { "epoch": 5.506271379703534, "grad_norm": 2.78125, "learning_rate": 2.2911807627050745e-06, "loss": 0.5657, "mean_token_accuracy": 0.8817677646875381, "num_tokens": 258486085.0, "step": 2417 }, { "epoch": 5.508551881413911, "grad_norm": 3.265625, "learning_rate": 2.2893039551049104e-06, "loss": 0.5869, "mean_token_accuracy": 0.8783139288425446, "num_tokens": 258593269.0, "step": 2418 }, { "epoch": 5.510832383124288, "grad_norm": 2.78125, "learning_rate": 2.2874272670917534e-06, "loss": 0.5944, "mean_token_accuracy": 0.8795038163661957, "num_tokens": 258700704.0, "step": 2419 }, { "epoch": 5.513112884834664, "grad_norm": 4.40625, "learning_rate": 2.2855506997307766e-06, "loss": 0.5625, "mean_token_accuracy": 0.8858631104230881, "num_tokens": 258807861.0, "step": 2420 }, { "epoch": 5.513112884834664, "eval_loss": 0.5877215266227722, "eval_mean_token_accuracy": 0.8795312627186793, "eval_num_tokens": 258807861.0, "eval_runtime": 58.5506, "eval_samples_per_second": 143.21, "eval_steps_per_second": 4.492, "step": 2420 }, { "epoch": 5.51539338654504, "grad_norm": 5.46875, "learning_rate": 2.283674254087082e-06, "loss": 0.5588, "mean_token_accuracy": 0.8837638050317764, "num_tokens": 258914737.0, "step": 2421 }, { "epoch": 5.517673888255416, "grad_norm": 3.390625, "learning_rate": 2.281797931225705e-06, "loss": 0.6097, "mean_token_accuracy": 0.8738208562135696, "num_tokens": 259021178.0, "step": 2422 }, { "epoch": 5.519954389965792, "grad_norm": 2.6875, "learning_rate": 2.279921732211609e-06, "loss": 0.5914, "mean_token_accuracy": 0.8749384880065918, "num_tokens": 259127761.0, "step": 2423 }, { "epoch": 5.5222348916761685, "grad_norm": 2.453125, "learning_rate": 2.278045658109689e-06, "loss": 0.5707, "mean_token_accuracy": 0.8829948306083679, "num_tokens": 259234696.0, "step": 2424 }, { "epoch": 5.524515393386545, "grad_norm": 3.21875, "learning_rate": 2.2761697099847686e-06, "loss": 0.59, "mean_token_accuracy": 0.8792230039834976, "num_tokens": 259341237.0, "step": 2425 }, { "epoch": 5.526795895096921, "grad_norm": 3.796875, "learning_rate": 2.274293888901599e-06, "loss": 0.5621, "mean_token_accuracy": 0.8843846768140793, "num_tokens": 259448732.0, "step": 2426 }, { "epoch": 5.529076396807298, "grad_norm": 3.484375, "learning_rate": 2.2724181959248627e-06, "loss": 0.5692, "mean_token_accuracy": 0.8816651403903961, "num_tokens": 259555843.0, "step": 2427 }, { "epoch": 5.531356898517674, "grad_norm": 3.46875, "learning_rate": 2.270542632119163e-06, "loss": 0.5845, "mean_token_accuracy": 0.8784616589546204, "num_tokens": 259662832.0, "step": 2428 }, { "epoch": 5.53363740022805, "grad_norm": 2.703125, "learning_rate": 2.2686671985490355e-06, "loss": 0.5587, "mean_token_accuracy": 0.8840401917695999, "num_tokens": 259770125.0, "step": 2429 }, { "epoch": 5.535917901938427, "grad_norm": 5.78125, "learning_rate": 2.26679189627894e-06, "loss": 0.5922, "mean_token_accuracy": 0.8798678070306778, "num_tokens": 259876661.0, "step": 2430 }, { "epoch": 5.538198403648803, "grad_norm": 2.859375, "learning_rate": 2.264916726373263e-06, "loss": 0.5936, "mean_token_accuracy": 0.8790270835161209, "num_tokens": 259982852.0, "step": 2431 }, { "epoch": 5.540478905359179, "grad_norm": 5.375, "learning_rate": 2.263041689896313e-06, "loss": 0.5722, "mean_token_accuracy": 0.881781816482544, "num_tokens": 260089664.0, "step": 2432 }, { "epoch": 5.542759407069555, "grad_norm": 3.796875, "learning_rate": 2.261166787912325e-06, "loss": 0.5976, "mean_token_accuracy": 0.8784068375825882, "num_tokens": 260196663.0, "step": 2433 }, { "epoch": 5.545039908779931, "grad_norm": 2.96875, "learning_rate": 2.2592920214854573e-06, "loss": 0.5744, "mean_token_accuracy": 0.8845723420381546, "num_tokens": 260303381.0, "step": 2434 }, { "epoch": 5.5473204104903076, "grad_norm": 3.109375, "learning_rate": 2.2574173916797912e-06, "loss": 0.5798, "mean_token_accuracy": 0.8818473219871521, "num_tokens": 260410524.0, "step": 2435 }, { "epoch": 5.549600912200685, "grad_norm": 2.765625, "learning_rate": 2.2555428995593303e-06, "loss": 0.5764, "mean_token_accuracy": 0.8806977868080139, "num_tokens": 260517537.0, "step": 2436 }, { "epoch": 5.55188141391106, "grad_norm": 6.15625, "learning_rate": 2.253668546188e-06, "loss": 0.5816, "mean_token_accuracy": 0.8790431469678879, "num_tokens": 260624932.0, "step": 2437 }, { "epoch": 5.554161915621437, "grad_norm": 4.15625, "learning_rate": 2.2517943326296487e-06, "loss": 0.5477, "mean_token_accuracy": 0.8867908716201782, "num_tokens": 260732712.0, "step": 2438 }, { "epoch": 5.556442417331813, "grad_norm": 3.8125, "learning_rate": 2.249920259948041e-06, "loss": 0.5552, "mean_token_accuracy": 0.8851701766252518, "num_tokens": 260840731.0, "step": 2439 }, { "epoch": 5.558722919042189, "grad_norm": 3.828125, "learning_rate": 2.2480463292068655e-06, "loss": 0.5642, "mean_token_accuracy": 0.8820369690656662, "num_tokens": 260947477.0, "step": 2440 }, { "epoch": 5.561003420752566, "grad_norm": 3.453125, "learning_rate": 2.24617254146973e-06, "loss": 0.5957, "mean_token_accuracy": 0.879251167178154, "num_tokens": 261054404.0, "step": 2441 }, { "epoch": 5.563283922462942, "grad_norm": 4.0625, "learning_rate": 2.2442988978001594e-06, "loss": 0.5664, "mean_token_accuracy": 0.8821232914924622, "num_tokens": 261162432.0, "step": 2442 }, { "epoch": 5.565564424173318, "grad_norm": 3.046875, "learning_rate": 2.2424253992615983e-06, "loss": 0.5748, "mean_token_accuracy": 0.8804102689027786, "num_tokens": 261269127.0, "step": 2443 }, { "epoch": 5.567844925883694, "grad_norm": 5.65625, "learning_rate": 2.2405520469174084e-06, "loss": 0.5802, "mean_token_accuracy": 0.8801386058330536, "num_tokens": 261376055.0, "step": 2444 }, { "epoch": 5.57012542759407, "grad_norm": 2.546875, "learning_rate": 2.238678841830867e-06, "loss": 0.5766, "mean_token_accuracy": 0.8823980838060379, "num_tokens": 261482935.0, "step": 2445 }, { "epoch": 5.572405929304447, "grad_norm": 5.5625, "learning_rate": 2.23680578506517e-06, "loss": 0.583, "mean_token_accuracy": 0.8805437237024307, "num_tokens": 261589878.0, "step": 2446 }, { "epoch": 5.574686431014824, "grad_norm": 2.65625, "learning_rate": 2.234932877683428e-06, "loss": 0.5763, "mean_token_accuracy": 0.878250241279602, "num_tokens": 261696700.0, "step": 2447 }, { "epoch": 5.5769669327252, "grad_norm": 4.28125, "learning_rate": 2.233060120748667e-06, "loss": 0.5807, "mean_token_accuracy": 0.8817458301782608, "num_tokens": 261803758.0, "step": 2448 }, { "epoch": 5.579247434435576, "grad_norm": 3.421875, "learning_rate": 2.2311875153238296e-06, "loss": 0.5845, "mean_token_accuracy": 0.8814815580844879, "num_tokens": 261910576.0, "step": 2449 }, { "epoch": 5.581527936145952, "grad_norm": 2.8125, "learning_rate": 2.229315062471768e-06, "loss": 0.5653, "mean_token_accuracy": 0.8858354687690735, "num_tokens": 262017627.0, "step": 2450 }, { "epoch": 5.583808437856328, "grad_norm": 5.59375, "learning_rate": 2.2274427632552507e-06, "loss": 0.5912, "mean_token_accuracy": 0.8767371773719788, "num_tokens": 262124172.0, "step": 2451 }, { "epoch": 5.586088939566705, "grad_norm": 5.96875, "learning_rate": 2.2255706187369596e-06, "loss": 0.5664, "mean_token_accuracy": 0.8821327090263367, "num_tokens": 262231439.0, "step": 2452 }, { "epoch": 5.588369441277081, "grad_norm": 5.46875, "learning_rate": 2.223698629979487e-06, "loss": 0.5721, "mean_token_accuracy": 0.8792487680912018, "num_tokens": 262338613.0, "step": 2453 }, { "epoch": 5.590649942987457, "grad_norm": 5.65625, "learning_rate": 2.221826798045338e-06, "loss": 0.5621, "mean_token_accuracy": 0.8853352516889572, "num_tokens": 262445246.0, "step": 2454 }, { "epoch": 5.592930444697833, "grad_norm": 5.34375, "learning_rate": 2.2199551239969284e-06, "loss": 0.5763, "mean_token_accuracy": 0.8816657811403275, "num_tokens": 262552338.0, "step": 2455 }, { "epoch": 5.59521094640821, "grad_norm": 3.3125, "learning_rate": 2.2180836088965833e-06, "loss": 0.5937, "mean_token_accuracy": 0.8782707005739212, "num_tokens": 262659476.0, "step": 2456 }, { "epoch": 5.5974914481185865, "grad_norm": 2.9375, "learning_rate": 2.216212253806539e-06, "loss": 0.567, "mean_token_accuracy": 0.88326196372509, "num_tokens": 262766605.0, "step": 2457 }, { "epoch": 5.599771949828963, "grad_norm": 3.140625, "learning_rate": 2.214341059788941e-06, "loss": 0.5598, "mean_token_accuracy": 0.8842233419418335, "num_tokens": 262873710.0, "step": 2458 }, { "epoch": 5.602052451539339, "grad_norm": 3.21875, "learning_rate": 2.2124700279058435e-06, "loss": 0.5855, "mean_token_accuracy": 0.8748954981565475, "num_tokens": 262980715.0, "step": 2459 }, { "epoch": 5.604332953249715, "grad_norm": 5.28125, "learning_rate": 2.2105991592192063e-06, "loss": 0.5897, "mean_token_accuracy": 0.8786115646362305, "num_tokens": 263087540.0, "step": 2460 }, { "epoch": 5.606613454960091, "grad_norm": 3.53125, "learning_rate": 2.208728454790899e-06, "loss": 0.57, "mean_token_accuracy": 0.8810366690158844, "num_tokens": 263194654.0, "step": 2461 }, { "epoch": 5.608893956670467, "grad_norm": 4.3125, "learning_rate": 2.2068579156826974e-06, "loss": 0.5828, "mean_token_accuracy": 0.8800259828567505, "num_tokens": 263302451.0, "step": 2462 }, { "epoch": 5.611174458380844, "grad_norm": 2.65625, "learning_rate": 2.2049875429562845e-06, "loss": 0.5813, "mean_token_accuracy": 0.8829780966043472, "num_tokens": 263409487.0, "step": 2463 }, { "epoch": 5.61345496009122, "grad_norm": 2.640625, "learning_rate": 2.203117337673246e-06, "loss": 0.5514, "mean_token_accuracy": 0.8871753662824631, "num_tokens": 263516373.0, "step": 2464 }, { "epoch": 5.615735461801596, "grad_norm": 3.734375, "learning_rate": 2.2012473008950756e-06, "loss": 0.5481, "mean_token_accuracy": 0.8859710693359375, "num_tokens": 263624001.0, "step": 2465 }, { "epoch": 5.618015963511972, "grad_norm": 4.4375, "learning_rate": 2.1993774336831696e-06, "loss": 0.5768, "mean_token_accuracy": 0.8810239285230637, "num_tokens": 263731549.0, "step": 2466 }, { "epoch": 5.620296465222349, "grad_norm": 3.171875, "learning_rate": 2.197507737098828e-06, "loss": 0.5722, "mean_token_accuracy": 0.8845034837722778, "num_tokens": 263838875.0, "step": 2467 }, { "epoch": 5.6225769669327255, "grad_norm": 4.59375, "learning_rate": 2.195638212203255e-06, "loss": 0.5716, "mean_token_accuracy": 0.881766065955162, "num_tokens": 263945489.0, "step": 2468 }, { "epoch": 5.624857468643102, "grad_norm": 3.09375, "learning_rate": 2.193768860057557e-06, "loss": 0.5533, "mean_token_accuracy": 0.8849270343780518, "num_tokens": 264052839.0, "step": 2469 }, { "epoch": 5.627137970353478, "grad_norm": 2.984375, "learning_rate": 2.191899681722743e-06, "loss": 0.5856, "mean_token_accuracy": 0.8784494251012802, "num_tokens": 264159607.0, "step": 2470 }, { "epoch": 5.629418472063854, "grad_norm": 4.15625, "learning_rate": 2.19003067825972e-06, "loss": 0.5743, "mean_token_accuracy": 0.8801737427711487, "num_tokens": 264266733.0, "step": 2471 }, { "epoch": 5.63169897377423, "grad_norm": 3.125, "learning_rate": 2.1881618507293004e-06, "loss": 0.5579, "mean_token_accuracy": 0.8845822513103485, "num_tokens": 264374129.0, "step": 2472 }, { "epoch": 5.633979475484606, "grad_norm": 2.546875, "learning_rate": 2.186293200192194e-06, "loss": 0.5713, "mean_token_accuracy": 0.8821998536586761, "num_tokens": 264481221.0, "step": 2473 }, { "epoch": 5.636259977194983, "grad_norm": 2.640625, "learning_rate": 2.1844247277090113e-06, "loss": 0.5764, "mean_token_accuracy": 0.8810071498155594, "num_tokens": 264588602.0, "step": 2474 }, { "epoch": 5.638540478905359, "grad_norm": 3.90625, "learning_rate": 2.1825564343402606e-06, "loss": 0.5958, "mean_token_accuracy": 0.8767081648111343, "num_tokens": 264695289.0, "step": 2475 }, { "epoch": 5.640820980615736, "grad_norm": 4.71875, "learning_rate": 2.180688321146349e-06, "loss": 0.5983, "mean_token_accuracy": 0.8806226551532745, "num_tokens": 264801458.0, "step": 2476 }, { "epoch": 5.643101482326112, "grad_norm": 3.484375, "learning_rate": 2.1788203891875818e-06, "loss": 0.566, "mean_token_accuracy": 0.8851145952939987, "num_tokens": 264908510.0, "step": 2477 }, { "epoch": 5.645381984036488, "grad_norm": 2.890625, "learning_rate": 2.176952639524161e-06, "loss": 0.5673, "mean_token_accuracy": 0.8833800852298737, "num_tokens": 265015635.0, "step": 2478 }, { "epoch": 5.6476624857468645, "grad_norm": 3.796875, "learning_rate": 2.175085073216185e-06, "loss": 0.5733, "mean_token_accuracy": 0.8817281723022461, "num_tokens": 265122707.0, "step": 2479 }, { "epoch": 5.649942987457241, "grad_norm": 2.6875, "learning_rate": 2.173217691323649e-06, "loss": 0.5607, "mean_token_accuracy": 0.8875688314437866, "num_tokens": 265229807.0, "step": 2480 }, { "epoch": 5.652223489167617, "grad_norm": 2.78125, "learning_rate": 2.1713504949064433e-06, "loss": 0.5366, "mean_token_accuracy": 0.8914333879947662, "num_tokens": 265337470.0, "step": 2481 }, { "epoch": 5.654503990877993, "grad_norm": 3.125, "learning_rate": 2.169483485024351e-06, "loss": 0.5678, "mean_token_accuracy": 0.8841661512851715, "num_tokens": 265444298.0, "step": 2482 }, { "epoch": 5.656784492588369, "grad_norm": 3.125, "learning_rate": 2.167616662737052e-06, "loss": 0.5528, "mean_token_accuracy": 0.8871481865644455, "num_tokens": 265551726.0, "step": 2483 }, { "epoch": 5.659064994298745, "grad_norm": 4.0, "learning_rate": 2.1657500291041185e-06, "loss": 0.5541, "mean_token_accuracy": 0.8866736739873886, "num_tokens": 265659350.0, "step": 2484 }, { "epoch": 5.661345496009122, "grad_norm": 3.75, "learning_rate": 2.1638835851850155e-06, "loss": 0.583, "mean_token_accuracy": 0.8781305104494095, "num_tokens": 265766187.0, "step": 2485 }, { "epoch": 5.663625997719498, "grad_norm": 2.8125, "learning_rate": 2.1620173320391007e-06, "loss": 0.5734, "mean_token_accuracy": 0.8814955800771713, "num_tokens": 265873426.0, "step": 2486 }, { "epoch": 5.665906499429875, "grad_norm": 2.65625, "learning_rate": 2.160151270725623e-06, "loss": 0.5756, "mean_token_accuracy": 0.8812149912118912, "num_tokens": 265980661.0, "step": 2487 }, { "epoch": 5.668187001140251, "grad_norm": 5.96875, "learning_rate": 2.158285402303723e-06, "loss": 0.5938, "mean_token_accuracy": 0.8758508861064911, "num_tokens": 266087907.0, "step": 2488 }, { "epoch": 5.670467502850627, "grad_norm": 3.203125, "learning_rate": 2.1564197278324317e-06, "loss": 0.5639, "mean_token_accuracy": 0.8836510181427002, "num_tokens": 266195252.0, "step": 2489 }, { "epoch": 5.6727480045610035, "grad_norm": 3.296875, "learning_rate": 2.1545542483706694e-06, "loss": 0.5682, "mean_token_accuracy": 0.8824823647737503, "num_tokens": 266302369.0, "step": 2490 }, { "epoch": 5.67502850627138, "grad_norm": 2.625, "learning_rate": 2.1526889649772477e-06, "loss": 0.5612, "mean_token_accuracy": 0.885822594165802, "num_tokens": 266409450.0, "step": 2491 }, { "epoch": 5.677309007981756, "grad_norm": 3.578125, "learning_rate": 2.1508238787108633e-06, "loss": 0.574, "mean_token_accuracy": 0.8812492936849594, "num_tokens": 266516611.0, "step": 2492 }, { "epoch": 5.679589509692132, "grad_norm": 5.28125, "learning_rate": 2.1489589906301046e-06, "loss": 0.5951, "mean_token_accuracy": 0.8759992122650146, "num_tokens": 266623332.0, "step": 2493 }, { "epoch": 5.681870011402508, "grad_norm": 3.453125, "learning_rate": 2.1470943017934455e-06, "loss": 0.5674, "mean_token_accuracy": 0.8840955346822739, "num_tokens": 266730787.0, "step": 2494 }, { "epoch": 5.684150513112884, "grad_norm": 3.265625, "learning_rate": 2.145229813259248e-06, "loss": 0.5672, "mean_token_accuracy": 0.8826257884502411, "num_tokens": 266838139.0, "step": 2495 }, { "epoch": 5.6864310148232615, "grad_norm": 2.90625, "learning_rate": 2.143365526085759e-06, "loss": 0.5761, "mean_token_accuracy": 0.88275146484375, "num_tokens": 266944920.0, "step": 2496 }, { "epoch": 5.688711516533638, "grad_norm": 2.765625, "learning_rate": 2.1415014413311126e-06, "loss": 0.5877, "mean_token_accuracy": 0.880971685051918, "num_tokens": 267051841.0, "step": 2497 }, { "epoch": 5.690992018244014, "grad_norm": 3.09375, "learning_rate": 2.139637560053327e-06, "loss": 0.5747, "mean_token_accuracy": 0.8808932155370712, "num_tokens": 267159338.0, "step": 2498 }, { "epoch": 5.69327251995439, "grad_norm": 3.65625, "learning_rate": 2.137773883310305e-06, "loss": 0.5713, "mean_token_accuracy": 0.8823635280132294, "num_tokens": 267266461.0, "step": 2499 }, { "epoch": 5.695553021664766, "grad_norm": 3.28125, "learning_rate": 2.1359104121598337e-06, "loss": 0.58, "mean_token_accuracy": 0.881933718919754, "num_tokens": 267373196.0, "step": 2500 }, { "epoch": 5.6978335233751425, "grad_norm": 4.0, "learning_rate": 2.1340471476595836e-06, "loss": 0.5806, "mean_token_accuracy": 0.8774979561567307, "num_tokens": 267480795.0, "step": 2501 }, { "epoch": 5.700114025085519, "grad_norm": 2.671875, "learning_rate": 2.1321840908671082e-06, "loss": 0.5798, "mean_token_accuracy": 0.8825899064540863, "num_tokens": 267587894.0, "step": 2502 }, { "epoch": 5.702394526795895, "grad_norm": 3.96875, "learning_rate": 2.1303212428398407e-06, "loss": 0.5757, "mean_token_accuracy": 0.8776924163103104, "num_tokens": 267694535.0, "step": 2503 }, { "epoch": 5.704675028506271, "grad_norm": 3.296875, "learning_rate": 2.1284586046350996e-06, "loss": 0.5826, "mean_token_accuracy": 0.8794005513191223, "num_tokens": 267801843.0, "step": 2504 }, { "epoch": 5.706955530216648, "grad_norm": 2.921875, "learning_rate": 2.126596177310081e-06, "loss": 0.5546, "mean_token_accuracy": 0.8848482072353363, "num_tokens": 267909136.0, "step": 2505 }, { "epoch": 5.7092360319270234, "grad_norm": 3.125, "learning_rate": 2.124733961921864e-06, "loss": 0.5662, "mean_token_accuracy": 0.8841657936573029, "num_tokens": 268016448.0, "step": 2506 }, { "epoch": 5.7115165336374005, "grad_norm": 3.484375, "learning_rate": 2.1228719595274056e-06, "loss": 0.6123, "mean_token_accuracy": 0.8750414401292801, "num_tokens": 268123331.0, "step": 2507 }, { "epoch": 5.713797035347777, "grad_norm": 3.9375, "learning_rate": 2.1210101711835413e-06, "loss": 0.5645, "mean_token_accuracy": 0.8833545595407486, "num_tokens": 268230339.0, "step": 2508 }, { "epoch": 5.716077537058153, "grad_norm": 5.53125, "learning_rate": 2.1191485979469877e-06, "loss": 0.5788, "mean_token_accuracy": 0.880239725112915, "num_tokens": 268337047.0, "step": 2509 }, { "epoch": 5.718358038768529, "grad_norm": 3.765625, "learning_rate": 2.1172872408743374e-06, "loss": 0.5785, "mean_token_accuracy": 0.8804043382406235, "num_tokens": 268444006.0, "step": 2510 }, { "epoch": 5.720638540478905, "grad_norm": 4.5625, "learning_rate": 2.11542610102206e-06, "loss": 0.6014, "mean_token_accuracy": 0.8754177987575531, "num_tokens": 268550836.0, "step": 2511 }, { "epoch": 5.7229190421892815, "grad_norm": 3.390625, "learning_rate": 2.1135651794465032e-06, "loss": 0.5698, "mean_token_accuracy": 0.8840764611959457, "num_tokens": 268658140.0, "step": 2512 }, { "epoch": 5.725199543899658, "grad_norm": 4.09375, "learning_rate": 2.1117044772038915e-06, "loss": 0.5779, "mean_token_accuracy": 0.8794988542795181, "num_tokens": 268764953.0, "step": 2513 }, { "epoch": 5.727480045610034, "grad_norm": 4.40625, "learning_rate": 2.1098439953503207e-06, "loss": 0.5787, "mean_token_accuracy": 0.8798924386501312, "num_tokens": 268872046.0, "step": 2514 }, { "epoch": 5.72976054732041, "grad_norm": 5.96875, "learning_rate": 2.1079837349417664e-06, "loss": 0.58, "mean_token_accuracy": 0.8788573145866394, "num_tokens": 268978631.0, "step": 2515 }, { "epoch": 5.732041049030787, "grad_norm": 4.375, "learning_rate": 2.1061236970340756e-06, "loss": 0.5553, "mean_token_accuracy": 0.8862823694944382, "num_tokens": 269085803.0, "step": 2516 }, { "epoch": 5.734321550741163, "grad_norm": 3.171875, "learning_rate": 2.104263882682971e-06, "loss": 0.5867, "mean_token_accuracy": 0.8813868910074234, "num_tokens": 269192710.0, "step": 2517 }, { "epoch": 5.7366020524515395, "grad_norm": 3.671875, "learning_rate": 2.1024042929440465e-06, "loss": 0.5592, "mean_token_accuracy": 0.8835171908140182, "num_tokens": 269299690.0, "step": 2518 }, { "epoch": 5.738882554161916, "grad_norm": 3.0, "learning_rate": 2.1005449288727696e-06, "loss": 0.5593, "mean_token_accuracy": 0.8817470818758011, "num_tokens": 269406982.0, "step": 2519 }, { "epoch": 5.741163055872292, "grad_norm": 3.109375, "learning_rate": 2.0986857915244787e-06, "loss": 0.5702, "mean_token_accuracy": 0.8834887593984604, "num_tokens": 269514394.0, "step": 2520 }, { "epoch": 5.743443557582668, "grad_norm": 3.140625, "learning_rate": 2.096826881954385e-06, "loss": 0.5795, "mean_token_accuracy": 0.8815822005271912, "num_tokens": 269621655.0, "step": 2521 }, { "epoch": 5.745724059293044, "grad_norm": 3.296875, "learning_rate": 2.0949682012175693e-06, "loss": 0.5816, "mean_token_accuracy": 0.8810020089149475, "num_tokens": 269729019.0, "step": 2522 }, { "epoch": 5.7480045610034205, "grad_norm": 2.734375, "learning_rate": 2.093109750368983e-06, "loss": 0.5615, "mean_token_accuracy": 0.8838683366775513, "num_tokens": 269836494.0, "step": 2523 }, { "epoch": 5.750285062713797, "grad_norm": 3.890625, "learning_rate": 2.0912515304634485e-06, "loss": 0.5744, "mean_token_accuracy": 0.8854202926158905, "num_tokens": 269943579.0, "step": 2524 }, { "epoch": 5.752565564424174, "grad_norm": 3.234375, "learning_rate": 2.089393542555653e-06, "loss": 0.5693, "mean_token_accuracy": 0.8842357248067856, "num_tokens": 270050301.0, "step": 2525 }, { "epoch": 5.75484606613455, "grad_norm": 3.328125, "learning_rate": 2.0875357877001556e-06, "loss": 0.5587, "mean_token_accuracy": 0.8842559903860092, "num_tokens": 270157064.0, "step": 2526 }, { "epoch": 5.757126567844926, "grad_norm": 4.625, "learning_rate": 2.085678266951382e-06, "loss": 0.5951, "mean_token_accuracy": 0.8778895437717438, "num_tokens": 270263910.0, "step": 2527 }, { "epoch": 5.759407069555302, "grad_norm": 4.5, "learning_rate": 2.083820981363626e-06, "loss": 0.5729, "mean_token_accuracy": 0.8812290579080582, "num_tokens": 270371067.0, "step": 2528 }, { "epoch": 5.7616875712656785, "grad_norm": 2.5, "learning_rate": 2.0819639319910466e-06, "loss": 0.5658, "mean_token_accuracy": 0.8800959140062332, "num_tokens": 270478072.0, "step": 2529 }, { "epoch": 5.763968072976055, "grad_norm": 2.6875, "learning_rate": 2.0801071198876684e-06, "loss": 0.5703, "mean_token_accuracy": 0.8809636831283569, "num_tokens": 270584725.0, "step": 2530 }, { "epoch": 5.766248574686431, "grad_norm": 3.03125, "learning_rate": 2.0782505461073822e-06, "loss": 0.5835, "mean_token_accuracy": 0.8816069960594177, "num_tokens": 270691204.0, "step": 2531 }, { "epoch": 5.768529076396807, "grad_norm": 5.15625, "learning_rate": 2.076394211703944e-06, "loss": 0.5505, "mean_token_accuracy": 0.8873765766620636, "num_tokens": 270798291.0, "step": 2532 }, { "epoch": 5.770809578107183, "grad_norm": 4.5625, "learning_rate": 2.0745381177309732e-06, "loss": 0.5627, "mean_token_accuracy": 0.8861078023910522, "num_tokens": 270905483.0, "step": 2533 }, { "epoch": 5.7730900798175595, "grad_norm": 2.984375, "learning_rate": 2.072682265241954e-06, "loss": 0.5536, "mean_token_accuracy": 0.8886701166629791, "num_tokens": 271012509.0, "step": 2534 }, { "epoch": 5.775370581527936, "grad_norm": 5.21875, "learning_rate": 2.0708266552902303e-06, "loss": 0.5645, "mean_token_accuracy": 0.8834939152002335, "num_tokens": 271119387.0, "step": 2535 }, { "epoch": 5.777651083238313, "grad_norm": 4.34375, "learning_rate": 2.0689712889290114e-06, "loss": 0.5758, "mean_token_accuracy": 0.8821908384561539, "num_tokens": 271226607.0, "step": 2536 }, { "epoch": 5.779931584948689, "grad_norm": 3.25, "learning_rate": 2.0671161672113677e-06, "loss": 0.5895, "mean_token_accuracy": 0.8805604577064514, "num_tokens": 271333818.0, "step": 2537 }, { "epoch": 5.782212086659065, "grad_norm": 2.375, "learning_rate": 2.06526129119023e-06, "loss": 0.5637, "mean_token_accuracy": 0.8876101672649384, "num_tokens": 271440910.0, "step": 2538 }, { "epoch": 5.784492588369441, "grad_norm": 2.78125, "learning_rate": 2.063406661918391e-06, "loss": 0.5708, "mean_token_accuracy": 0.8823280781507492, "num_tokens": 271548224.0, "step": 2539 }, { "epoch": 5.7867730900798175, "grad_norm": 3.234375, "learning_rate": 2.0615522804485027e-06, "loss": 0.5788, "mean_token_accuracy": 0.8831372708082199, "num_tokens": 271655042.0, "step": 2540 }, { "epoch": 5.789053591790194, "grad_norm": 2.5, "learning_rate": 2.059698147833075e-06, "loss": 0.5802, "mean_token_accuracy": 0.8820475935935974, "num_tokens": 271762363.0, "step": 2541 }, { "epoch": 5.79133409350057, "grad_norm": 3.484375, "learning_rate": 2.0578442651244774e-06, "loss": 0.5628, "mean_token_accuracy": 0.8825207501649857, "num_tokens": 271869380.0, "step": 2542 }, { "epoch": 5.793614595210946, "grad_norm": 5.1875, "learning_rate": 2.0559906333749392e-06, "loss": 0.5647, "mean_token_accuracy": 0.8834405839443207, "num_tokens": 271976735.0, "step": 2543 }, { "epoch": 5.795895096921322, "grad_norm": 3.1875, "learning_rate": 2.054137253636545e-06, "loss": 0.578, "mean_token_accuracy": 0.8806865364313126, "num_tokens": 272084686.0, "step": 2544 }, { "epoch": 5.798175598631699, "grad_norm": 2.859375, "learning_rate": 2.0522841269612397e-06, "loss": 0.5912, "mean_token_accuracy": 0.8805492520332336, "num_tokens": 272192178.0, "step": 2545 }, { "epoch": 5.800456100342076, "grad_norm": 3.671875, "learning_rate": 2.0504312544008193e-06, "loss": 0.5647, "mean_token_accuracy": 0.883878156542778, "num_tokens": 272298806.0, "step": 2546 }, { "epoch": 5.802736602052452, "grad_norm": 3.640625, "learning_rate": 2.048578637006939e-06, "loss": 0.5506, "mean_token_accuracy": 0.8853472024202347, "num_tokens": 272405957.0, "step": 2547 }, { "epoch": 5.805017103762828, "grad_norm": 2.6875, "learning_rate": 2.04672627583111e-06, "loss": 0.5702, "mean_token_accuracy": 0.8789206445217133, "num_tokens": 272513063.0, "step": 2548 }, { "epoch": 5.807297605473204, "grad_norm": 2.78125, "learning_rate": 2.0448741719246962e-06, "loss": 0.569, "mean_token_accuracy": 0.883039727807045, "num_tokens": 272619770.0, "step": 2549 }, { "epoch": 5.80957810718358, "grad_norm": 2.75, "learning_rate": 2.043022326338916e-06, "loss": 0.5868, "mean_token_accuracy": 0.8768695294857025, "num_tokens": 272726957.0, "step": 2550 }, { "epoch": 5.811858608893957, "grad_norm": 3.65625, "learning_rate": 2.0411707401248406e-06, "loss": 0.5602, "mean_token_accuracy": 0.8837281614542007, "num_tokens": 272834440.0, "step": 2551 }, { "epoch": 5.814139110604333, "grad_norm": 3.046875, "learning_rate": 2.0393194143333956e-06, "loss": 0.5583, "mean_token_accuracy": 0.8858349472284317, "num_tokens": 272941653.0, "step": 2552 }, { "epoch": 5.816419612314709, "grad_norm": 4.21875, "learning_rate": 2.0374683500153564e-06, "loss": 0.567, "mean_token_accuracy": 0.884008064866066, "num_tokens": 273048925.0, "step": 2553 }, { "epoch": 5.818700114025085, "grad_norm": 5.0625, "learning_rate": 2.0356175482213523e-06, "loss": 0.5693, "mean_token_accuracy": 0.8851135820150375, "num_tokens": 273155880.0, "step": 2554 }, { "epoch": 5.820980615735461, "grad_norm": 2.8125, "learning_rate": 2.033767010001863e-06, "loss": 0.5922, "mean_token_accuracy": 0.8770723342895508, "num_tokens": 273262101.0, "step": 2555 }, { "epoch": 5.823261117445838, "grad_norm": 3.0625, "learning_rate": 2.0319167364072184e-06, "loss": 0.5795, "mean_token_accuracy": 0.8830315619707108, "num_tokens": 273369542.0, "step": 2556 }, { "epoch": 5.825541619156215, "grad_norm": 3.40625, "learning_rate": 2.0300667284875965e-06, "loss": 0.5795, "mean_token_accuracy": 0.8794274926185608, "num_tokens": 273477240.0, "step": 2557 }, { "epoch": 5.827822120866591, "grad_norm": 3.765625, "learning_rate": 2.0282169872930275e-06, "loss": 0.5663, "mean_token_accuracy": 0.8839514553546906, "num_tokens": 273584941.0, "step": 2558 }, { "epoch": 5.830102622576967, "grad_norm": 2.5625, "learning_rate": 2.026367513873388e-06, "loss": 0.5544, "mean_token_accuracy": 0.8844399601221085, "num_tokens": 273692501.0, "step": 2559 }, { "epoch": 5.832383124287343, "grad_norm": 3.015625, "learning_rate": 2.0245183092784046e-06, "loss": 0.5856, "mean_token_accuracy": 0.8805771768093109, "num_tokens": 273799051.0, "step": 2560 }, { "epoch": 5.834663625997719, "grad_norm": 3.265625, "learning_rate": 2.0226693745576494e-06, "loss": 0.5662, "mean_token_accuracy": 0.8822827190160751, "num_tokens": 273906681.0, "step": 2561 }, { "epoch": 5.836944127708096, "grad_norm": 3.046875, "learning_rate": 2.020820710760541e-06, "loss": 0.5538, "mean_token_accuracy": 0.8871154636144638, "num_tokens": 274014464.0, "step": 2562 }, { "epoch": 5.839224629418472, "grad_norm": 3.3125, "learning_rate": 2.018972318936347e-06, "loss": 0.5931, "mean_token_accuracy": 0.8768787384033203, "num_tokens": 274121211.0, "step": 2563 }, { "epoch": 5.841505131128848, "grad_norm": 3.25, "learning_rate": 2.017124200134178e-06, "loss": 0.5581, "mean_token_accuracy": 0.8847302943468094, "num_tokens": 274228462.0, "step": 2564 }, { "epoch": 5.843785632839225, "grad_norm": 3.484375, "learning_rate": 2.01527635540299e-06, "loss": 0.5809, "mean_token_accuracy": 0.8782871812582016, "num_tokens": 274335776.0, "step": 2565 }, { "epoch": 5.846066134549601, "grad_norm": 2.609375, "learning_rate": 2.0134287857915864e-06, "loss": 0.5794, "mean_token_accuracy": 0.8803008496761322, "num_tokens": 274443604.0, "step": 2566 }, { "epoch": 5.848346636259977, "grad_norm": 2.53125, "learning_rate": 2.0115814923486093e-06, "loss": 0.557, "mean_token_accuracy": 0.8840547949075699, "num_tokens": 274550591.0, "step": 2567 }, { "epoch": 5.850627137970354, "grad_norm": 2.6875, "learning_rate": 2.009734476122547e-06, "loss": 0.559, "mean_token_accuracy": 0.8849809020757675, "num_tokens": 274657799.0, "step": 2568 }, { "epoch": 5.85290763968073, "grad_norm": 2.875, "learning_rate": 2.007887738161732e-06, "loss": 0.5731, "mean_token_accuracy": 0.8857974410057068, "num_tokens": 274765249.0, "step": 2569 }, { "epoch": 5.855188141391106, "grad_norm": 2.546875, "learning_rate": 2.006041279514336e-06, "loss": 0.5747, "mean_token_accuracy": 0.8802428096532822, "num_tokens": 274871966.0, "step": 2570 }, { "epoch": 5.857468643101482, "grad_norm": 3.65625, "learning_rate": 2.004195101228374e-06, "loss": 0.5453, "mean_token_accuracy": 0.8866171091794968, "num_tokens": 274979490.0, "step": 2571 }, { "epoch": 5.859749144811858, "grad_norm": 2.890625, "learning_rate": 2.002349204351701e-06, "loss": 0.5668, "mean_token_accuracy": 0.8811267167329788, "num_tokens": 275086036.0, "step": 2572 }, { "epoch": 5.862029646522235, "grad_norm": 3.46875, "learning_rate": 2.0005035899320115e-06, "loss": 0.5726, "mean_token_accuracy": 0.8833982795476913, "num_tokens": 275193239.0, "step": 2573 }, { "epoch": 5.864310148232612, "grad_norm": 3.046875, "learning_rate": 1.998658259016841e-06, "loss": 0.5751, "mean_token_accuracy": 0.8820087909698486, "num_tokens": 275300065.0, "step": 2574 }, { "epoch": 5.866590649942988, "grad_norm": 2.84375, "learning_rate": 1.996813212653564e-06, "loss": 0.592, "mean_token_accuracy": 0.8738995492458344, "num_tokens": 275407231.0, "step": 2575 }, { "epoch": 5.868871151653364, "grad_norm": 3.25, "learning_rate": 1.9949684518893926e-06, "loss": 0.6003, "mean_token_accuracy": 0.8753588199615479, "num_tokens": 275514097.0, "step": 2576 }, { "epoch": 5.87115165336374, "grad_norm": 2.828125, "learning_rate": 1.9931239777713794e-06, "loss": 0.5453, "mean_token_accuracy": 0.8908671289682388, "num_tokens": 275621304.0, "step": 2577 }, { "epoch": 5.873432155074116, "grad_norm": 2.546875, "learning_rate": 1.9912797913464098e-06, "loss": 0.5424, "mean_token_accuracy": 0.8852016925811768, "num_tokens": 275728465.0, "step": 2578 }, { "epoch": 5.875712656784493, "grad_norm": 3.125, "learning_rate": 1.989435893661209e-06, "loss": 0.5527, "mean_token_accuracy": 0.8849275410175323, "num_tokens": 275835525.0, "step": 2579 }, { "epoch": 5.877993158494869, "grad_norm": 2.796875, "learning_rate": 1.9875922857623387e-06, "loss": 0.5584, "mean_token_accuracy": 0.8849629461765289, "num_tokens": 275942316.0, "step": 2580 }, { "epoch": 5.880273660205245, "grad_norm": 3.171875, "learning_rate": 1.985748968696194e-06, "loss": 0.5834, "mean_token_accuracy": 0.8805515915155411, "num_tokens": 276048810.0, "step": 2581 }, { "epoch": 5.882554161915621, "grad_norm": 3.171875, "learning_rate": 1.9839059435090073e-06, "loss": 0.5616, "mean_token_accuracy": 0.8853743970394135, "num_tokens": 276155430.0, "step": 2582 }, { "epoch": 5.884834663625997, "grad_norm": 3.921875, "learning_rate": 1.9820632112468437e-06, "loss": 0.5613, "mean_token_accuracy": 0.8853378593921661, "num_tokens": 276262832.0, "step": 2583 }, { "epoch": 5.887115165336374, "grad_norm": 3.625, "learning_rate": 1.9802207729556023e-06, "loss": 0.5857, "mean_token_accuracy": 0.876113772392273, "num_tokens": 276369762.0, "step": 2584 }, { "epoch": 5.889395667046751, "grad_norm": 2.734375, "learning_rate": 1.9783786296810148e-06, "loss": 0.5496, "mean_token_accuracy": 0.8896660208702087, "num_tokens": 276477298.0, "step": 2585 }, { "epoch": 5.891676168757127, "grad_norm": 3.765625, "learning_rate": 1.9765367824686467e-06, "loss": 0.5847, "mean_token_accuracy": 0.8809142112731934, "num_tokens": 276583840.0, "step": 2586 }, { "epoch": 5.893956670467503, "grad_norm": 3.015625, "learning_rate": 1.974695232363895e-06, "loss": 0.5606, "mean_token_accuracy": 0.8866210728883743, "num_tokens": 276690816.0, "step": 2587 }, { "epoch": 5.896237172177879, "grad_norm": 2.859375, "learning_rate": 1.9728539804119893e-06, "loss": 0.5868, "mean_token_accuracy": 0.8767704367637634, "num_tokens": 276797914.0, "step": 2588 }, { "epoch": 5.898517673888255, "grad_norm": 4.53125, "learning_rate": 1.9710130276579864e-06, "loss": 0.586, "mean_token_accuracy": 0.8765504211187363, "num_tokens": 276904386.0, "step": 2589 }, { "epoch": 5.900798175598632, "grad_norm": 3.046875, "learning_rate": 1.969172375146776e-06, "loss": 0.5591, "mean_token_accuracy": 0.8872781246900558, "num_tokens": 277011618.0, "step": 2590 }, { "epoch": 5.903078677309008, "grad_norm": 2.875, "learning_rate": 1.9673320239230783e-06, "loss": 0.5783, "mean_token_accuracy": 0.8807976096868515, "num_tokens": 277118651.0, "step": 2591 }, { "epoch": 5.905359179019384, "grad_norm": 2.5, "learning_rate": 1.9654919750314396e-06, "loss": 0.5609, "mean_token_accuracy": 0.8855329602956772, "num_tokens": 277225439.0, "step": 2592 }, { "epoch": 5.90763968072976, "grad_norm": 3.75, "learning_rate": 1.9636522295162375e-06, "loss": 0.5705, "mean_token_accuracy": 0.8821413516998291, "num_tokens": 277332844.0, "step": 2593 }, { "epoch": 5.909920182440137, "grad_norm": 4.3125, "learning_rate": 1.9618127884216753e-06, "loss": 0.5734, "mean_token_accuracy": 0.8815776854753494, "num_tokens": 277439892.0, "step": 2594 }, { "epoch": 5.9122006841505135, "grad_norm": 2.828125, "learning_rate": 1.959973652791784e-06, "loss": 0.5554, "mean_token_accuracy": 0.885757714509964, "num_tokens": 277547646.0, "step": 2595 }, { "epoch": 5.91448118586089, "grad_norm": 2.625, "learning_rate": 1.9581348236704217e-06, "loss": 0.5701, "mean_token_accuracy": 0.8831838816404343, "num_tokens": 277654799.0, "step": 2596 }, { "epoch": 5.916761687571266, "grad_norm": 3.171875, "learning_rate": 1.9562963021012723e-06, "loss": 0.581, "mean_token_accuracy": 0.8817932456731796, "num_tokens": 277761954.0, "step": 2597 }, { "epoch": 5.919042189281642, "grad_norm": 3.4375, "learning_rate": 1.954458089127845e-06, "loss": 0.567, "mean_token_accuracy": 0.8825689405202866, "num_tokens": 277869215.0, "step": 2598 }, { "epoch": 5.921322690992018, "grad_norm": 3.140625, "learning_rate": 1.952620185793475e-06, "loss": 0.572, "mean_token_accuracy": 0.8813809603452682, "num_tokens": 277975771.0, "step": 2599 }, { "epoch": 5.923603192702394, "grad_norm": 4.46875, "learning_rate": 1.9507825931413193e-06, "loss": 0.5823, "mean_token_accuracy": 0.8809202462434769, "num_tokens": 278082716.0, "step": 2600 }, { "epoch": 5.925883694412771, "grad_norm": 2.5, "learning_rate": 1.9489453122143605e-06, "loss": 0.5458, "mean_token_accuracy": 0.8862514644861221, "num_tokens": 278189379.0, "step": 2601 }, { "epoch": 5.928164196123147, "grad_norm": 2.671875, "learning_rate": 1.947108344055404e-06, "loss": 0.5578, "mean_token_accuracy": 0.8841505497694016, "num_tokens": 278296342.0, "step": 2602 }, { "epoch": 5.930444697833523, "grad_norm": 3.84375, "learning_rate": 1.9452716897070785e-06, "loss": 0.5847, "mean_token_accuracy": 0.8791919499635696, "num_tokens": 278403387.0, "step": 2603 }, { "epoch": 5.932725199543899, "grad_norm": 2.578125, "learning_rate": 1.943435350211832e-06, "loss": 0.5815, "mean_token_accuracy": 0.8786250501871109, "num_tokens": 278510323.0, "step": 2604 }, { "epoch": 5.935005701254276, "grad_norm": 6.0, "learning_rate": 1.941599326611935e-06, "loss": 0.5688, "mean_token_accuracy": 0.8808638751506805, "num_tokens": 278616886.0, "step": 2605 }, { "epoch": 5.9372862029646525, "grad_norm": 3.0, "learning_rate": 1.939763619949481e-06, "loss": 0.5821, "mean_token_accuracy": 0.880496084690094, "num_tokens": 278724068.0, "step": 2606 }, { "epoch": 5.939566704675029, "grad_norm": 2.5625, "learning_rate": 1.9379282312663797e-06, "loss": 0.5737, "mean_token_accuracy": 0.8817182034254074, "num_tokens": 278830958.0, "step": 2607 }, { "epoch": 5.941847206385405, "grad_norm": 3.125, "learning_rate": 1.936093161604363e-06, "loss": 0.536, "mean_token_accuracy": 0.8876519352197647, "num_tokens": 278938000.0, "step": 2608 }, { "epoch": 5.944127708095781, "grad_norm": 2.953125, "learning_rate": 1.9342584120049824e-06, "loss": 0.6064, "mean_token_accuracy": 0.876665860414505, "num_tokens": 279044994.0, "step": 2609 }, { "epoch": 5.946408209806157, "grad_norm": 2.890625, "learning_rate": 1.9324239835096044e-06, "loss": 0.5781, "mean_token_accuracy": 0.8785818070173264, "num_tokens": 279152387.0, "step": 2610 }, { "epoch": 5.9486887115165334, "grad_norm": 3.46875, "learning_rate": 1.930589877159415e-06, "loss": 0.5713, "mean_token_accuracy": 0.8864869177341461, "num_tokens": 279259832.0, "step": 2611 }, { "epoch": 5.95096921322691, "grad_norm": 2.640625, "learning_rate": 1.928756093995419e-06, "loss": 0.5698, "mean_token_accuracy": 0.8803494274616241, "num_tokens": 279366863.0, "step": 2612 }, { "epoch": 5.953249714937286, "grad_norm": 2.71875, "learning_rate": 1.9269226350584357e-06, "loss": 0.5807, "mean_token_accuracy": 0.8808418363332748, "num_tokens": 279473717.0, "step": 2613 }, { "epoch": 5.955530216647663, "grad_norm": 3.375, "learning_rate": 1.9250895013891015e-06, "loss": 0.5787, "mean_token_accuracy": 0.8824566006660461, "num_tokens": 279580810.0, "step": 2614 }, { "epoch": 5.957810718358039, "grad_norm": 3.078125, "learning_rate": 1.9232566940278675e-06, "loss": 0.5545, "mean_token_accuracy": 0.8856438547372818, "num_tokens": 279687888.0, "step": 2615 }, { "epoch": 5.960091220068415, "grad_norm": 3.703125, "learning_rate": 1.9214242140149987e-06, "loss": 0.5858, "mean_token_accuracy": 0.8792087882757187, "num_tokens": 279794989.0, "step": 2616 }, { "epoch": 5.9623717217787915, "grad_norm": 2.828125, "learning_rate": 1.9195920623905766e-06, "loss": 0.5757, "mean_token_accuracy": 0.8819312304258347, "num_tokens": 279901750.0, "step": 2617 }, { "epoch": 5.964652223489168, "grad_norm": 3.46875, "learning_rate": 1.9177602401944943e-06, "loss": 0.5786, "mean_token_accuracy": 0.8836159706115723, "num_tokens": 280008000.0, "step": 2618 }, { "epoch": 5.966932725199544, "grad_norm": 4.03125, "learning_rate": 1.915928748466459e-06, "loss": 0.5794, "mean_token_accuracy": 0.8819707930088043, "num_tokens": 280115535.0, "step": 2619 }, { "epoch": 5.96921322690992, "grad_norm": 3.21875, "learning_rate": 1.9140975882459912e-06, "loss": 0.5797, "mean_token_accuracy": 0.8795643150806427, "num_tokens": 280222510.0, "step": 2620 }, { "epoch": 5.971493728620296, "grad_norm": 4.53125, "learning_rate": 1.9122667605724202e-06, "loss": 0.5781, "mean_token_accuracy": 0.8824938386678696, "num_tokens": 280329229.0, "step": 2621 }, { "epoch": 5.9737742303306725, "grad_norm": 4.375, "learning_rate": 1.910436266484889e-06, "loss": 0.5898, "mean_token_accuracy": 0.8779114931821823, "num_tokens": 280436155.0, "step": 2622 }, { "epoch": 5.976054732041049, "grad_norm": 5.21875, "learning_rate": 1.908606107022351e-06, "loss": 0.5658, "mean_token_accuracy": 0.8845522552728653, "num_tokens": 280543656.0, "step": 2623 }, { "epoch": 5.978335233751425, "grad_norm": 3.625, "learning_rate": 1.9067762832235698e-06, "loss": 0.5732, "mean_token_accuracy": 0.8839434236288071, "num_tokens": 280651076.0, "step": 2624 }, { "epoch": 5.980615735461802, "grad_norm": 4.6875, "learning_rate": 1.9049467961271184e-06, "loss": 0.5698, "mean_token_accuracy": 0.881288930773735, "num_tokens": 280758080.0, "step": 2625 }, { "epoch": 5.982896237172178, "grad_norm": 2.71875, "learning_rate": 1.9031176467713763e-06, "loss": 0.5581, "mean_token_accuracy": 0.8865546584129333, "num_tokens": 280865575.0, "step": 2626 }, { "epoch": 5.985176738882554, "grad_norm": 3.921875, "learning_rate": 1.9012888361945354e-06, "loss": 0.5608, "mean_token_accuracy": 0.8823821991682053, "num_tokens": 280972852.0, "step": 2627 }, { "epoch": 5.9874572405929305, "grad_norm": 4.90625, "learning_rate": 1.8994603654345917e-06, "loss": 0.5856, "mean_token_accuracy": 0.8786684423685074, "num_tokens": 281079620.0, "step": 2628 }, { "epoch": 5.989737742303307, "grad_norm": 4.9375, "learning_rate": 1.897632235529351e-06, "loss": 0.5979, "mean_token_accuracy": 0.8792405128479004, "num_tokens": 281186453.0, "step": 2629 }, { "epoch": 5.992018244013683, "grad_norm": 2.84375, "learning_rate": 1.8958044475164242e-06, "loss": 0.5692, "mean_token_accuracy": 0.8830309361219406, "num_tokens": 281293819.0, "step": 2630 }, { "epoch": 5.994298745724059, "grad_norm": 5.46875, "learning_rate": 1.8939770024332294e-06, "loss": 0.5425, "mean_token_accuracy": 0.8879213184118271, "num_tokens": 281400651.0, "step": 2631 }, { "epoch": 5.996579247434435, "grad_norm": 2.875, "learning_rate": 1.8921499013169876e-06, "loss": 0.577, "mean_token_accuracy": 0.8807700723409653, "num_tokens": 281507623.0, "step": 2632 }, { "epoch": 5.9988597491448115, "grad_norm": 2.65625, "learning_rate": 1.8903231452047265e-06, "loss": 0.5645, "mean_token_accuracy": 0.8816178441047668, "num_tokens": 281614487.0, "step": 2633 }, { "epoch": 6.0, "grad_norm": 5.5625, "learning_rate": 1.8884967351332778e-06, "loss": 0.5877, "mean_token_accuracy": 0.8762392401695251, "num_tokens": 281653392.0, "step": 2634 }, { "epoch": 6.002280501710376, "grad_norm": 2.734375, "learning_rate": 1.886670672139277e-06, "loss": 0.5603, "mean_token_accuracy": 0.8846362233161926, "num_tokens": 281760658.0, "step": 2635 }, { "epoch": 6.004561003420752, "grad_norm": 2.59375, "learning_rate": 1.884844957259163e-06, "loss": 0.5781, "mean_token_accuracy": 0.8855700343847275, "num_tokens": 281867681.0, "step": 2636 }, { "epoch": 6.006841505131129, "grad_norm": 5.15625, "learning_rate": 1.8830195915291741e-06, "loss": 0.5797, "mean_token_accuracy": 0.8807573318481445, "num_tokens": 281974677.0, "step": 2637 }, { "epoch": 6.009122006841505, "grad_norm": 3.09375, "learning_rate": 1.8811945759853543e-06, "loss": 0.5822, "mean_token_accuracy": 0.8797610551118851, "num_tokens": 282081659.0, "step": 2638 }, { "epoch": 6.011402508551882, "grad_norm": 3.234375, "learning_rate": 1.879369911663546e-06, "loss": 0.5799, "mean_token_accuracy": 0.881444662809372, "num_tokens": 282188705.0, "step": 2639 }, { "epoch": 6.013683010262258, "grad_norm": 2.765625, "learning_rate": 1.8775455995993941e-06, "loss": 0.5571, "mean_token_accuracy": 0.8842919617891312, "num_tokens": 282295789.0, "step": 2640 }, { "epoch": 6.013683010262258, "eval_loss": 0.5869334936141968, "eval_mean_token_accuracy": 0.8798886684863739, "eval_num_tokens": 282295789.0, "eval_runtime": 58.6353, "eval_samples_per_second": 143.003, "eval_steps_per_second": 4.485, "step": 2640 }, { "epoch": 6.015963511972634, "grad_norm": 2.9375, "learning_rate": 1.875721640828344e-06, "loss": 0.5686, "mean_token_accuracy": 0.8848172128200531, "num_tokens": 282402500.0, "step": 2641 }, { "epoch": 6.01824401368301, "grad_norm": 3.71875, "learning_rate": 1.8738980363856376e-06, "loss": 0.5911, "mean_token_accuracy": 0.8776390701532364, "num_tokens": 282509194.0, "step": 2642 }, { "epoch": 6.020524515393387, "grad_norm": 3.09375, "learning_rate": 1.8720747873063184e-06, "loss": 0.5629, "mean_token_accuracy": 0.8839791864156723, "num_tokens": 282616314.0, "step": 2643 }, { "epoch": 6.022805017103763, "grad_norm": 2.78125, "learning_rate": 1.870251894625227e-06, "loss": 0.563, "mean_token_accuracy": 0.8833863288164139, "num_tokens": 282723232.0, "step": 2644 }, { "epoch": 6.025085518814139, "grad_norm": 3.140625, "learning_rate": 1.8684293593770026e-06, "loss": 0.5728, "mean_token_accuracy": 0.8824484199285507, "num_tokens": 282829758.0, "step": 2645 }, { "epoch": 6.027366020524515, "grad_norm": 3.34375, "learning_rate": 1.866607182596081e-06, "loss": 0.5839, "mean_token_accuracy": 0.8822183609008789, "num_tokens": 282936677.0, "step": 2646 }, { "epoch": 6.029646522234891, "grad_norm": 2.8125, "learning_rate": 1.8647853653166953e-06, "loss": 0.5873, "mean_token_accuracy": 0.8796509057283401, "num_tokens": 283043715.0, "step": 2647 }, { "epoch": 6.031927023945268, "grad_norm": 3.0, "learning_rate": 1.862963908572872e-06, "loss": 0.5755, "mean_token_accuracy": 0.8782380074262619, "num_tokens": 283150407.0, "step": 2648 }, { "epoch": 6.034207525655645, "grad_norm": 2.8125, "learning_rate": 1.8611428133984365e-06, "loss": 0.5848, "mean_token_accuracy": 0.8795290291309357, "num_tokens": 283257249.0, "step": 2649 }, { "epoch": 6.036488027366021, "grad_norm": 3.84375, "learning_rate": 1.8593220808270057e-06, "loss": 0.5619, "mean_token_accuracy": 0.881279394030571, "num_tokens": 283363971.0, "step": 2650 }, { "epoch": 6.038768529076397, "grad_norm": 4.78125, "learning_rate": 1.857501711891993e-06, "loss": 0.552, "mean_token_accuracy": 0.8844835609197617, "num_tokens": 283471402.0, "step": 2651 }, { "epoch": 6.041049030786773, "grad_norm": 3.828125, "learning_rate": 1.8556817076266059e-06, "loss": 0.5612, "mean_token_accuracy": 0.8844869136810303, "num_tokens": 283578013.0, "step": 2652 }, { "epoch": 6.043329532497149, "grad_norm": 2.84375, "learning_rate": 1.8538620690638414e-06, "loss": 0.5541, "mean_token_accuracy": 0.8868309855461121, "num_tokens": 283685194.0, "step": 2653 }, { "epoch": 6.045610034207526, "grad_norm": 2.890625, "learning_rate": 1.8520427972364924e-06, "loss": 0.5879, "mean_token_accuracy": 0.8806286752223969, "num_tokens": 283792579.0, "step": 2654 }, { "epoch": 6.047890535917902, "grad_norm": 3.953125, "learning_rate": 1.8502238931771422e-06, "loss": 0.5739, "mean_token_accuracy": 0.884283035993576, "num_tokens": 283899492.0, "step": 2655 }, { "epoch": 6.050171037628278, "grad_norm": 2.8125, "learning_rate": 1.848405357918166e-06, "loss": 0.596, "mean_token_accuracy": 0.8780128657817841, "num_tokens": 284006120.0, "step": 2656 }, { "epoch": 6.052451539338654, "grad_norm": 3.140625, "learning_rate": 1.8465871924917295e-06, "loss": 0.6124, "mean_token_accuracy": 0.8742113560438156, "num_tokens": 284112587.0, "step": 2657 }, { "epoch": 6.05473204104903, "grad_norm": 2.65625, "learning_rate": 1.8447693979297882e-06, "loss": 0.5548, "mean_token_accuracy": 0.8869588524103165, "num_tokens": 284219902.0, "step": 2658 }, { "epoch": 6.0570125427594075, "grad_norm": 3.765625, "learning_rate": 1.8429519752640862e-06, "loss": 0.5597, "mean_token_accuracy": 0.8819890022277832, "num_tokens": 284327196.0, "step": 2659 }, { "epoch": 6.059293044469784, "grad_norm": 2.4375, "learning_rate": 1.8411349255261587e-06, "loss": 0.5647, "mean_token_accuracy": 0.8833761811256409, "num_tokens": 284434704.0, "step": 2660 }, { "epoch": 6.06157354618016, "grad_norm": 5.125, "learning_rate": 1.8393182497473271e-06, "loss": 0.5674, "mean_token_accuracy": 0.8822661340236664, "num_tokens": 284542263.0, "step": 2661 }, { "epoch": 6.063854047890536, "grad_norm": 3.875, "learning_rate": 1.837501948958702e-06, "loss": 0.5619, "mean_token_accuracy": 0.8850172013044357, "num_tokens": 284649568.0, "step": 2662 }, { "epoch": 6.066134549600912, "grad_norm": 2.65625, "learning_rate": 1.8356860241911817e-06, "loss": 0.5662, "mean_token_accuracy": 0.8816670328378677, "num_tokens": 284757479.0, "step": 2663 }, { "epoch": 6.068415051311288, "grad_norm": 3.53125, "learning_rate": 1.833870476475448e-06, "loss": 0.5754, "mean_token_accuracy": 0.8798322230577469, "num_tokens": 284864552.0, "step": 2664 }, { "epoch": 6.070695553021665, "grad_norm": 4.8125, "learning_rate": 1.8320553068419716e-06, "loss": 0.5802, "mean_token_accuracy": 0.8766209781169891, "num_tokens": 284971361.0, "step": 2665 }, { "epoch": 6.072976054732041, "grad_norm": 3.3125, "learning_rate": 1.830240516321008e-06, "loss": 0.5904, "mean_token_accuracy": 0.8826578855514526, "num_tokens": 285078270.0, "step": 2666 }, { "epoch": 6.075256556442417, "grad_norm": 4.03125, "learning_rate": 1.8284261059425972e-06, "loss": 0.5675, "mean_token_accuracy": 0.8815598785877228, "num_tokens": 285185441.0, "step": 2667 }, { "epoch": 6.077537058152793, "grad_norm": 3.046875, "learning_rate": 1.8266120767365642e-06, "loss": 0.5596, "mean_token_accuracy": 0.8829776495695114, "num_tokens": 285291883.0, "step": 2668 }, { "epoch": 6.07981755986317, "grad_norm": 4.59375, "learning_rate": 1.8247984297325156e-06, "loss": 0.585, "mean_token_accuracy": 0.8801479190587997, "num_tokens": 285398806.0, "step": 2669 }, { "epoch": 6.0820980615735465, "grad_norm": 3.25, "learning_rate": 1.8229851659598425e-06, "loss": 0.5735, "mean_token_accuracy": 0.8817605227231979, "num_tokens": 285505881.0, "step": 2670 }, { "epoch": 6.084378563283923, "grad_norm": 3.125, "learning_rate": 1.8211722864477197e-06, "loss": 0.5865, "mean_token_accuracy": 0.8797835558652878, "num_tokens": 285612626.0, "step": 2671 }, { "epoch": 6.086659064994299, "grad_norm": 3.546875, "learning_rate": 1.819359792225101e-06, "loss": 0.5725, "mean_token_accuracy": 0.8806306272745132, "num_tokens": 285720025.0, "step": 2672 }, { "epoch": 6.088939566704675, "grad_norm": 3.109375, "learning_rate": 1.8175476843207245e-06, "loss": 0.5537, "mean_token_accuracy": 0.8841009885072708, "num_tokens": 285827144.0, "step": 2673 }, { "epoch": 6.091220068415051, "grad_norm": 2.734375, "learning_rate": 1.8157359637631078e-06, "loss": 0.5758, "mean_token_accuracy": 0.8842537254095078, "num_tokens": 285933624.0, "step": 2674 }, { "epoch": 6.0935005701254275, "grad_norm": 3.421875, "learning_rate": 1.813924631580547e-06, "loss": 0.5834, "mean_token_accuracy": 0.881016343832016, "num_tokens": 286040489.0, "step": 2675 }, { "epoch": 6.095781071835804, "grad_norm": 2.921875, "learning_rate": 1.8121136888011198e-06, "loss": 0.5817, "mean_token_accuracy": 0.8789900839328766, "num_tokens": 286146914.0, "step": 2676 }, { "epoch": 6.09806157354618, "grad_norm": 4.15625, "learning_rate": 1.810303136452683e-06, "loss": 0.5671, "mean_token_accuracy": 0.8830416947603226, "num_tokens": 286254159.0, "step": 2677 }, { "epoch": 6.100342075256556, "grad_norm": 2.546875, "learning_rate": 1.8084929755628707e-06, "loss": 0.5559, "mean_token_accuracy": 0.8868089914321899, "num_tokens": 286361203.0, "step": 2678 }, { "epoch": 6.102622576966933, "grad_norm": 2.890625, "learning_rate": 1.8066832071590967e-06, "loss": 0.5861, "mean_token_accuracy": 0.8792434334754944, "num_tokens": 286468526.0, "step": 2679 }, { "epoch": 6.104903078677309, "grad_norm": 3.140625, "learning_rate": 1.8048738322685478e-06, "loss": 0.5705, "mean_token_accuracy": 0.8830481469631195, "num_tokens": 286575282.0, "step": 2680 }, { "epoch": 6.1071835803876855, "grad_norm": 3.0625, "learning_rate": 1.8030648519181926e-06, "loss": 0.5604, "mean_token_accuracy": 0.8848658800125122, "num_tokens": 286682350.0, "step": 2681 }, { "epoch": 6.109464082098062, "grad_norm": 2.609375, "learning_rate": 1.8012562671347721e-06, "loss": 0.5672, "mean_token_accuracy": 0.8794921636581421, "num_tokens": 286789153.0, "step": 2682 }, { "epoch": 6.111744583808438, "grad_norm": 4.34375, "learning_rate": 1.7994480789448043e-06, "loss": 0.598, "mean_token_accuracy": 0.877688080072403, "num_tokens": 286896549.0, "step": 2683 }, { "epoch": 6.114025085518814, "grad_norm": 2.546875, "learning_rate": 1.7976402883745836e-06, "loss": 0.5455, "mean_token_accuracy": 0.8872903436422348, "num_tokens": 287003999.0, "step": 2684 }, { "epoch": 6.11630558722919, "grad_norm": 2.6875, "learning_rate": 1.7958328964501749e-06, "loss": 0.5613, "mean_token_accuracy": 0.8816230744123459, "num_tokens": 287110496.0, "step": 2685 }, { "epoch": 6.1185860889395665, "grad_norm": 3.46875, "learning_rate": 1.7940259041974189e-06, "loss": 0.583, "mean_token_accuracy": 0.8829924911260605, "num_tokens": 287217513.0, "step": 2686 }, { "epoch": 6.120866590649943, "grad_norm": 4.28125, "learning_rate": 1.7922193126419306e-06, "loss": 0.5719, "mean_token_accuracy": 0.8832157105207443, "num_tokens": 287324344.0, "step": 2687 }, { "epoch": 6.123147092360319, "grad_norm": 4.75, "learning_rate": 1.7904131228090965e-06, "loss": 0.5609, "mean_token_accuracy": 0.8847935795783997, "num_tokens": 287431380.0, "step": 2688 }, { "epoch": 6.125427594070696, "grad_norm": 3.84375, "learning_rate": 1.7886073357240746e-06, "loss": 0.5555, "mean_token_accuracy": 0.8844207972288132, "num_tokens": 287538513.0, "step": 2689 }, { "epoch": 6.127708095781072, "grad_norm": 3.109375, "learning_rate": 1.7868019524117957e-06, "loss": 0.5821, "mean_token_accuracy": 0.8769582509994507, "num_tokens": 287645242.0, "step": 2690 }, { "epoch": 6.129988597491448, "grad_norm": 3.375, "learning_rate": 1.7849969738969592e-06, "loss": 0.5993, "mean_token_accuracy": 0.8752488493919373, "num_tokens": 287751854.0, "step": 2691 }, { "epoch": 6.1322690992018245, "grad_norm": 2.765625, "learning_rate": 1.783192401204037e-06, "loss": 0.569, "mean_token_accuracy": 0.8854573518037796, "num_tokens": 287858938.0, "step": 2692 }, { "epoch": 6.134549600912201, "grad_norm": 4.34375, "learning_rate": 1.7813882353572692e-06, "loss": 0.5684, "mean_token_accuracy": 0.8819147795438766, "num_tokens": 287965788.0, "step": 2693 }, { "epoch": 6.136830102622577, "grad_norm": 3.484375, "learning_rate": 1.7795844773806653e-06, "loss": 0.5687, "mean_token_accuracy": 0.884269118309021, "num_tokens": 288073071.0, "step": 2694 }, { "epoch": 6.139110604332953, "grad_norm": 3.390625, "learning_rate": 1.7777811282980047e-06, "loss": 0.5668, "mean_token_accuracy": 0.8846654891967773, "num_tokens": 288180362.0, "step": 2695 }, { "epoch": 6.141391106043329, "grad_norm": 3.4375, "learning_rate": 1.7759781891328321e-06, "loss": 0.5662, "mean_token_accuracy": 0.8841256648302078, "num_tokens": 288287566.0, "step": 2696 }, { "epoch": 6.1436716077537055, "grad_norm": 5.59375, "learning_rate": 1.7741756609084616e-06, "loss": 0.5633, "mean_token_accuracy": 0.8848319351673126, "num_tokens": 288395039.0, "step": 2697 }, { "epoch": 6.145952109464082, "grad_norm": 3.875, "learning_rate": 1.772373544647973e-06, "loss": 0.5661, "mean_token_accuracy": 0.8844243884086609, "num_tokens": 288502651.0, "step": 2698 }, { "epoch": 6.148232611174459, "grad_norm": 3.234375, "learning_rate": 1.770571841374213e-06, "loss": 0.5655, "mean_token_accuracy": 0.882807731628418, "num_tokens": 288609620.0, "step": 2699 }, { "epoch": 6.150513112884835, "grad_norm": 3.828125, "learning_rate": 1.7687705521097954e-06, "loss": 0.5494, "mean_token_accuracy": 0.8844881951808929, "num_tokens": 288716986.0, "step": 2700 }, { "epoch": 6.152793614595211, "grad_norm": 3.296875, "learning_rate": 1.766969677877094e-06, "loss": 0.5695, "mean_token_accuracy": 0.8841705173254013, "num_tokens": 288824522.0, "step": 2701 }, { "epoch": 6.155074116305587, "grad_norm": 3.140625, "learning_rate": 1.7651692196982517e-06, "loss": 0.5635, "mean_token_accuracy": 0.8823985755443573, "num_tokens": 288931504.0, "step": 2702 }, { "epoch": 6.1573546180159635, "grad_norm": 3.234375, "learning_rate": 1.7633691785951746e-06, "loss": 0.5661, "mean_token_accuracy": 0.8812411278486252, "num_tokens": 289038321.0, "step": 2703 }, { "epoch": 6.15963511972634, "grad_norm": 3.28125, "learning_rate": 1.7615695555895296e-06, "loss": 0.5512, "mean_token_accuracy": 0.8865274041891098, "num_tokens": 289145591.0, "step": 2704 }, { "epoch": 6.161915621436716, "grad_norm": 3.65625, "learning_rate": 1.7597703517027491e-06, "loss": 0.5795, "mean_token_accuracy": 0.8811384439468384, "num_tokens": 289252364.0, "step": 2705 }, { "epoch": 6.164196123147092, "grad_norm": 2.5625, "learning_rate": 1.7579715679560273e-06, "loss": 0.5683, "mean_token_accuracy": 0.8845842778682709, "num_tokens": 289359378.0, "step": 2706 }, { "epoch": 6.166476624857468, "grad_norm": 3.234375, "learning_rate": 1.7561732053703174e-06, "loss": 0.5737, "mean_token_accuracy": 0.8825568854808807, "num_tokens": 289466858.0, "step": 2707 }, { "epoch": 6.168757126567845, "grad_norm": 4.15625, "learning_rate": 1.7543752649663354e-06, "loss": 0.564, "mean_token_accuracy": 0.8844540864229202, "num_tokens": 289574570.0, "step": 2708 }, { "epoch": 6.1710376282782216, "grad_norm": 2.984375, "learning_rate": 1.7525777477645586e-06, "loss": 0.554, "mean_token_accuracy": 0.8892973810434341, "num_tokens": 289682543.0, "step": 2709 }, { "epoch": 6.173318129988598, "grad_norm": 5.4375, "learning_rate": 1.7507806547852224e-06, "loss": 0.5571, "mean_token_accuracy": 0.885601207613945, "num_tokens": 289789759.0, "step": 2710 }, { "epoch": 6.175598631698974, "grad_norm": 3.671875, "learning_rate": 1.7489839870483236e-06, "loss": 0.5518, "mean_token_accuracy": 0.885807454586029, "num_tokens": 289897075.0, "step": 2711 }, { "epoch": 6.17787913340935, "grad_norm": 3.34375, "learning_rate": 1.7471877455736136e-06, "loss": 0.5733, "mean_token_accuracy": 0.8810791075229645, "num_tokens": 290004392.0, "step": 2712 }, { "epoch": 6.180159635119726, "grad_norm": 3.046875, "learning_rate": 1.7453919313806057e-06, "loss": 0.5888, "mean_token_accuracy": 0.880391463637352, "num_tokens": 290111469.0, "step": 2713 }, { "epoch": 6.1824401368301025, "grad_norm": 4.03125, "learning_rate": 1.7435965454885699e-06, "loss": 0.5589, "mean_token_accuracy": 0.8832577913999557, "num_tokens": 290218539.0, "step": 2714 }, { "epoch": 6.184720638540479, "grad_norm": 3.546875, "learning_rate": 1.7418015889165312e-06, "loss": 0.533, "mean_token_accuracy": 0.8896812498569489, "num_tokens": 290326424.0, "step": 2715 }, { "epoch": 6.187001140250855, "grad_norm": 3.8125, "learning_rate": 1.7400070626832732e-06, "loss": 0.5837, "mean_token_accuracy": 0.8802052289247513, "num_tokens": 290433540.0, "step": 2716 }, { "epoch": 6.189281641961231, "grad_norm": 4.34375, "learning_rate": 1.7382129678073351e-06, "loss": 0.5833, "mean_token_accuracy": 0.878139853477478, "num_tokens": 290540729.0, "step": 2717 }, { "epoch": 6.191562143671608, "grad_norm": 4.21875, "learning_rate": 1.7364193053070082e-06, "loss": 0.5799, "mean_token_accuracy": 0.8827469646930695, "num_tokens": 290647100.0, "step": 2718 }, { "epoch": 6.193842645381984, "grad_norm": 4.1875, "learning_rate": 1.7346260762003428e-06, "loss": 0.5656, "mean_token_accuracy": 0.8845665007829666, "num_tokens": 290753980.0, "step": 2719 }, { "epoch": 6.196123147092361, "grad_norm": 5.40625, "learning_rate": 1.7328332815051403e-06, "loss": 0.581, "mean_token_accuracy": 0.8774483948945999, "num_tokens": 290861484.0, "step": 2720 }, { "epoch": 6.198403648802737, "grad_norm": 3.171875, "learning_rate": 1.7310409222389563e-06, "loss": 0.5454, "mean_token_accuracy": 0.8843204379081726, "num_tokens": 290968606.0, "step": 2721 }, { "epoch": 6.200684150513113, "grad_norm": 3.546875, "learning_rate": 1.7292489994191005e-06, "loss": 0.5741, "mean_token_accuracy": 0.8810103088617325, "num_tokens": 291075885.0, "step": 2722 }, { "epoch": 6.202964652223489, "grad_norm": 2.75, "learning_rate": 1.7274575140626318e-06, "loss": 0.5756, "mean_token_accuracy": 0.8839705884456635, "num_tokens": 291183303.0, "step": 2723 }, { "epoch": 6.205245153933865, "grad_norm": 3.1875, "learning_rate": 1.7256664671863634e-06, "loss": 0.5398, "mean_token_accuracy": 0.8867494761943817, "num_tokens": 291290745.0, "step": 2724 }, { "epoch": 6.2075256556442415, "grad_norm": 5.1875, "learning_rate": 1.72387585980686e-06, "loss": 0.6035, "mean_token_accuracy": 0.8760453164577484, "num_tokens": 291397663.0, "step": 2725 }, { "epoch": 6.209806157354618, "grad_norm": 5.1875, "learning_rate": 1.7220856929404342e-06, "loss": 0.5625, "mean_token_accuracy": 0.8818619549274445, "num_tokens": 291504707.0, "step": 2726 }, { "epoch": 6.212086659064994, "grad_norm": 3.390625, "learning_rate": 1.720295967603152e-06, "loss": 0.5569, "mean_token_accuracy": 0.8842518627643585, "num_tokens": 291611837.0, "step": 2727 }, { "epoch": 6.214367160775371, "grad_norm": 5.78125, "learning_rate": 1.7185066848108244e-06, "loss": 0.5682, "mean_token_accuracy": 0.8826566338539124, "num_tokens": 291718969.0, "step": 2728 }, { "epoch": 6.216647662485747, "grad_norm": 4.59375, "learning_rate": 1.7167178455790157e-06, "loss": 0.5668, "mean_token_accuracy": 0.8848382532596588, "num_tokens": 291826377.0, "step": 2729 }, { "epoch": 6.218928164196123, "grad_norm": 3.6875, "learning_rate": 1.7149294509230357e-06, "loss": 0.5986, "mean_token_accuracy": 0.8756033033132553, "num_tokens": 291933188.0, "step": 2730 }, { "epoch": 6.2212086659065, "grad_norm": 2.84375, "learning_rate": 1.713141501857943e-06, "loss": 0.5674, "mean_token_accuracy": 0.8862540572881699, "num_tokens": 292040480.0, "step": 2731 }, { "epoch": 6.223489167616876, "grad_norm": 3.765625, "learning_rate": 1.7113539993985431e-06, "loss": 0.5827, "mean_token_accuracy": 0.8784683644771576, "num_tokens": 292147117.0, "step": 2732 }, { "epoch": 6.225769669327252, "grad_norm": 5.1875, "learning_rate": 1.7095669445593887e-06, "loss": 0.55, "mean_token_accuracy": 0.8856497257947922, "num_tokens": 292254295.0, "step": 2733 }, { "epoch": 6.228050171037628, "grad_norm": 3.203125, "learning_rate": 1.707780338354776e-06, "loss": 0.5757, "mean_token_accuracy": 0.8842949420213699, "num_tokens": 292361287.0, "step": 2734 }, { "epoch": 6.230330672748004, "grad_norm": 2.984375, "learning_rate": 1.7059941817987485e-06, "loss": 0.5717, "mean_token_accuracy": 0.8810069411993027, "num_tokens": 292468510.0, "step": 2735 }, { "epoch": 6.2326111744583805, "grad_norm": 2.921875, "learning_rate": 1.7042084759050948e-06, "loss": 0.5731, "mean_token_accuracy": 0.8827442526817322, "num_tokens": 292575697.0, "step": 2736 }, { "epoch": 6.234891676168757, "grad_norm": 2.53125, "learning_rate": 1.7024232216873465e-06, "loss": 0.5581, "mean_token_accuracy": 0.8848889619112015, "num_tokens": 292682969.0, "step": 2737 }, { "epoch": 6.237172177879134, "grad_norm": 4.84375, "learning_rate": 1.7006384201587809e-06, "loss": 0.5584, "mean_token_accuracy": 0.884590744972229, "num_tokens": 292790103.0, "step": 2738 }, { "epoch": 6.23945267958951, "grad_norm": 3.109375, "learning_rate": 1.6988540723324145e-06, "loss": 0.5753, "mean_token_accuracy": 0.881315678358078, "num_tokens": 292897356.0, "step": 2739 }, { "epoch": 6.241733181299886, "grad_norm": 3.546875, "learning_rate": 1.6970701792210101e-06, "loss": 0.5632, "mean_token_accuracy": 0.8851087540388107, "num_tokens": 293004918.0, "step": 2740 }, { "epoch": 6.244013683010262, "grad_norm": 3.1875, "learning_rate": 1.6952867418370707e-06, "loss": 0.5723, "mean_token_accuracy": 0.8804485946893692, "num_tokens": 293112328.0, "step": 2741 }, { "epoch": 6.246294184720639, "grad_norm": 5.96875, "learning_rate": 1.6935037611928412e-06, "loss": 0.5993, "mean_token_accuracy": 0.8765440136194229, "num_tokens": 293218993.0, "step": 2742 }, { "epoch": 6.248574686431015, "grad_norm": 5.1875, "learning_rate": 1.691721238300308e-06, "loss": 0.5587, "mean_token_accuracy": 0.882225289940834, "num_tokens": 293325757.0, "step": 2743 }, { "epoch": 6.250855188141391, "grad_norm": 3.265625, "learning_rate": 1.689939174171194e-06, "loss": 0.5923, "mean_token_accuracy": 0.876752108335495, "num_tokens": 293432687.0, "step": 2744 }, { "epoch": 6.253135689851767, "grad_norm": 3.5625, "learning_rate": 1.6881575698169662e-06, "loss": 0.5656, "mean_token_accuracy": 0.8828807026147842, "num_tokens": 293540213.0, "step": 2745 }, { "epoch": 6.255416191562143, "grad_norm": 2.71875, "learning_rate": 1.6863764262488292e-06, "loss": 0.5841, "mean_token_accuracy": 0.8817939162254333, "num_tokens": 293647497.0, "step": 2746 }, { "epoch": 6.2576966932725195, "grad_norm": 2.828125, "learning_rate": 1.6845957444777244e-06, "loss": 0.58, "mean_token_accuracy": 0.8802742511034012, "num_tokens": 293754862.0, "step": 2747 }, { "epoch": 6.259977194982897, "grad_norm": 2.640625, "learning_rate": 1.6828155255143331e-06, "loss": 0.5715, "mean_token_accuracy": 0.882528692483902, "num_tokens": 293861700.0, "step": 2748 }, { "epoch": 6.262257696693273, "grad_norm": 6.125, "learning_rate": 1.6810357703690739e-06, "loss": 0.5875, "mean_token_accuracy": 0.8790639489889145, "num_tokens": 293969027.0, "step": 2749 }, { "epoch": 6.264538198403649, "grad_norm": 2.734375, "learning_rate": 1.6792564800521e-06, "loss": 0.5739, "mean_token_accuracy": 0.8794273883104324, "num_tokens": 294075857.0, "step": 2750 }, { "epoch": 6.266818700114025, "grad_norm": 3.046875, "learning_rate": 1.677477655573303e-06, "loss": 0.5915, "mean_token_accuracy": 0.8795068562030792, "num_tokens": 294182659.0, "step": 2751 }, { "epoch": 6.269099201824401, "grad_norm": 3.796875, "learning_rate": 1.675699297942309e-06, "loss": 0.5741, "mean_token_accuracy": 0.8815376460552216, "num_tokens": 294289557.0, "step": 2752 }, { "epoch": 6.271379703534778, "grad_norm": 3.0, "learning_rate": 1.6739214081684799e-06, "loss": 0.5931, "mean_token_accuracy": 0.876123234629631, "num_tokens": 294396897.0, "step": 2753 }, { "epoch": 6.273660205245154, "grad_norm": 2.65625, "learning_rate": 1.6721439872609125e-06, "loss": 0.5649, "mean_token_accuracy": 0.8840298056602478, "num_tokens": 294504720.0, "step": 2754 }, { "epoch": 6.27594070695553, "grad_norm": 6.96875, "learning_rate": 1.6703670362284346e-06, "loss": 0.5652, "mean_token_accuracy": 0.8847046941518784, "num_tokens": 294611688.0, "step": 2755 }, { "epoch": 6.278221208665906, "grad_norm": 3.8125, "learning_rate": 1.6685905560796101e-06, "loss": 0.584, "mean_token_accuracy": 0.878638818860054, "num_tokens": 294718635.0, "step": 2756 }, { "epoch": 6.280501710376283, "grad_norm": 3.203125, "learning_rate": 1.6668145478227354e-06, "loss": 0.5705, "mean_token_accuracy": 0.8829765170812607, "num_tokens": 294825331.0, "step": 2757 }, { "epoch": 6.282782212086659, "grad_norm": 3.921875, "learning_rate": 1.6650390124658378e-06, "loss": 0.5761, "mean_token_accuracy": 0.878277525305748, "num_tokens": 294932357.0, "step": 2758 }, { "epoch": 6.285062713797036, "grad_norm": 2.9375, "learning_rate": 1.663263951016678e-06, "loss": 0.5759, "mean_token_accuracy": 0.8813124597072601, "num_tokens": 295039173.0, "step": 2759 }, { "epoch": 6.287343215507412, "grad_norm": 4.28125, "learning_rate": 1.661489364482745e-06, "loss": 0.575, "mean_token_accuracy": 0.8825227618217468, "num_tokens": 295146182.0, "step": 2760 }, { "epoch": 6.289623717217788, "grad_norm": 3.015625, "learning_rate": 1.6597152538712608e-06, "loss": 0.5799, "mean_token_accuracy": 0.8797194510698318, "num_tokens": 295253659.0, "step": 2761 }, { "epoch": 6.291904218928164, "grad_norm": 2.765625, "learning_rate": 1.6579416201891757e-06, "loss": 0.5815, "mean_token_accuracy": 0.8829602301120758, "num_tokens": 295361105.0, "step": 2762 }, { "epoch": 6.29418472063854, "grad_norm": 5.4375, "learning_rate": 1.6561684644431709e-06, "loss": 0.5895, "mean_token_accuracy": 0.8760716319084167, "num_tokens": 295467879.0, "step": 2763 }, { "epoch": 6.296465222348917, "grad_norm": 4.25, "learning_rate": 1.6543957876396544e-06, "loss": 0.5639, "mean_token_accuracy": 0.8809865862131119, "num_tokens": 295575043.0, "step": 2764 }, { "epoch": 6.298745724059293, "grad_norm": 4.40625, "learning_rate": 1.6526235907847649e-06, "loss": 0.585, "mean_token_accuracy": 0.8808043301105499, "num_tokens": 295682338.0, "step": 2765 }, { "epoch": 6.301026225769669, "grad_norm": 3.0625, "learning_rate": 1.6508518748843651e-06, "loss": 0.5653, "mean_token_accuracy": 0.8840913772583008, "num_tokens": 295789718.0, "step": 2766 }, { "epoch": 6.303306727480045, "grad_norm": 2.9375, "learning_rate": 1.649080640944048e-06, "loss": 0.5719, "mean_token_accuracy": 0.8826287239789963, "num_tokens": 295897773.0, "step": 2767 }, { "epoch": 6.305587229190422, "grad_norm": 3.28125, "learning_rate": 1.6473098899691313e-06, "loss": 0.5909, "mean_token_accuracy": 0.8791597187519073, "num_tokens": 296004423.0, "step": 2768 }, { "epoch": 6.307867730900798, "grad_norm": 4.0, "learning_rate": 1.6455396229646595e-06, "loss": 0.5639, "mean_token_accuracy": 0.8822521567344666, "num_tokens": 296111701.0, "step": 2769 }, { "epoch": 6.310148232611175, "grad_norm": 2.6875, "learning_rate": 1.6437698409354025e-06, "loss": 0.5532, "mean_token_accuracy": 0.887429729104042, "num_tokens": 296218964.0, "step": 2770 }, { "epoch": 6.312428734321551, "grad_norm": 2.921875, "learning_rate": 1.6420005448858522e-06, "loss": 0.5738, "mean_token_accuracy": 0.8810682147741318, "num_tokens": 296325719.0, "step": 2771 }, { "epoch": 6.314709236031927, "grad_norm": 4.21875, "learning_rate": 1.6402317358202286e-06, "loss": 0.5797, "mean_token_accuracy": 0.8821781575679779, "num_tokens": 296432464.0, "step": 2772 }, { "epoch": 6.316989737742303, "grad_norm": 4.125, "learning_rate": 1.6384634147424732e-06, "loss": 0.5668, "mean_token_accuracy": 0.8829574584960938, "num_tokens": 296539951.0, "step": 2773 }, { "epoch": 6.319270239452679, "grad_norm": 2.71875, "learning_rate": 1.636695582656251e-06, "loss": 0.5872, "mean_token_accuracy": 0.8789397776126862, "num_tokens": 296646664.0, "step": 2774 }, { "epoch": 6.321550741163056, "grad_norm": 3.359375, "learning_rate": 1.6349282405649506e-06, "loss": 0.5393, "mean_token_accuracy": 0.889549657702446, "num_tokens": 296754082.0, "step": 2775 }, { "epoch": 6.323831242873432, "grad_norm": 3.6875, "learning_rate": 1.6331613894716787e-06, "loss": 0.5546, "mean_token_accuracy": 0.8843847513198853, "num_tokens": 296861010.0, "step": 2776 }, { "epoch": 6.326111744583809, "grad_norm": 6.21875, "learning_rate": 1.6313950303792672e-06, "loss": 0.5679, "mean_token_accuracy": 0.8804360330104828, "num_tokens": 296968168.0, "step": 2777 }, { "epoch": 6.328392246294185, "grad_norm": 2.796875, "learning_rate": 1.6296291642902673e-06, "loss": 0.5791, "mean_token_accuracy": 0.8830510526895523, "num_tokens": 297075136.0, "step": 2778 }, { "epoch": 6.330672748004561, "grad_norm": 3.046875, "learning_rate": 1.6278637922069512e-06, "loss": 0.5924, "mean_token_accuracy": 0.8757496923208237, "num_tokens": 297182280.0, "step": 2779 }, { "epoch": 6.3329532497149374, "grad_norm": 4.125, "learning_rate": 1.6260989151313091e-06, "loss": 0.5731, "mean_token_accuracy": 0.8829326927661896, "num_tokens": 297288826.0, "step": 2780 }, { "epoch": 6.335233751425314, "grad_norm": 3.59375, "learning_rate": 1.6243345340650523e-06, "loss": 0.5787, "mean_token_accuracy": 0.8820019513368607, "num_tokens": 297395720.0, "step": 2781 }, { "epoch": 6.33751425313569, "grad_norm": 2.53125, "learning_rate": 1.6225706500096079e-06, "loss": 0.5719, "mean_token_accuracy": 0.8875852972269058, "num_tokens": 297502615.0, "step": 2782 }, { "epoch": 6.339794754846066, "grad_norm": 2.875, "learning_rate": 1.6208072639661226e-06, "loss": 0.5507, "mean_token_accuracy": 0.8859220743179321, "num_tokens": 297609799.0, "step": 2783 }, { "epoch": 6.342075256556442, "grad_norm": 3.03125, "learning_rate": 1.6190443769354608e-06, "loss": 0.5932, "mean_token_accuracy": 0.8765744715929031, "num_tokens": 297716620.0, "step": 2784 }, { "epoch": 6.344355758266818, "grad_norm": 2.84375, "learning_rate": 1.6172819899182036e-06, "loss": 0.5781, "mean_token_accuracy": 0.8797289431095123, "num_tokens": 297823497.0, "step": 2785 }, { "epoch": 6.346636259977195, "grad_norm": 4.9375, "learning_rate": 1.6155201039146478e-06, "loss": 0.5865, "mean_token_accuracy": 0.876724436879158, "num_tokens": 297930683.0, "step": 2786 }, { "epoch": 6.348916761687571, "grad_norm": 3.75, "learning_rate": 1.613758719924805e-06, "loss": 0.589, "mean_token_accuracy": 0.8765078336000443, "num_tokens": 298037433.0, "step": 2787 }, { "epoch": 6.351197263397948, "grad_norm": 2.640625, "learning_rate": 1.611997838948403e-06, "loss": 0.5686, "mean_token_accuracy": 0.8840204477310181, "num_tokens": 298145428.0, "step": 2788 }, { "epoch": 6.353477765108324, "grad_norm": 4.71875, "learning_rate": 1.6102374619848845e-06, "loss": 0.5716, "mean_token_accuracy": 0.8840190768241882, "num_tokens": 298252510.0, "step": 2789 }, { "epoch": 6.3557582668187, "grad_norm": 2.640625, "learning_rate": 1.6084775900334046e-06, "loss": 0.5767, "mean_token_accuracy": 0.8814188688993454, "num_tokens": 298359365.0, "step": 2790 }, { "epoch": 6.3580387685290765, "grad_norm": 2.953125, "learning_rate": 1.6067182240928332e-06, "loss": 0.5766, "mean_token_accuracy": 0.8799934387207031, "num_tokens": 298466492.0, "step": 2791 }, { "epoch": 6.360319270239453, "grad_norm": 2.96875, "learning_rate": 1.6049593651617534e-06, "loss": 0.5805, "mean_token_accuracy": 0.8819076120853424, "num_tokens": 298573579.0, "step": 2792 }, { "epoch": 6.362599771949829, "grad_norm": 3.21875, "learning_rate": 1.6032010142384572e-06, "loss": 0.575, "mean_token_accuracy": 0.880363255739212, "num_tokens": 298680688.0, "step": 2793 }, { "epoch": 6.364880273660205, "grad_norm": 3.28125, "learning_rate": 1.6014431723209522e-06, "loss": 0.5801, "mean_token_accuracy": 0.8782119452953339, "num_tokens": 298787550.0, "step": 2794 }, { "epoch": 6.367160775370581, "grad_norm": 3.3125, "learning_rate": 1.599685840406955e-06, "loss": 0.5735, "mean_token_accuracy": 0.8798055797815323, "num_tokens": 298895010.0, "step": 2795 }, { "epoch": 6.369441277080957, "grad_norm": 3.59375, "learning_rate": 1.5979290194938938e-06, "loss": 0.5831, "mean_token_accuracy": 0.8793617337942123, "num_tokens": 299001708.0, "step": 2796 }, { "epoch": 6.3717217787913345, "grad_norm": 3.015625, "learning_rate": 1.5961727105789072e-06, "loss": 0.5857, "mean_token_accuracy": 0.8799160867929459, "num_tokens": 299108766.0, "step": 2797 }, { "epoch": 6.374002280501711, "grad_norm": 3.375, "learning_rate": 1.5944169146588395e-06, "loss": 0.5493, "mean_token_accuracy": 0.8847594708204269, "num_tokens": 299215718.0, "step": 2798 }, { "epoch": 6.376282782212087, "grad_norm": 4.96875, "learning_rate": 1.5926616327302482e-06, "loss": 0.5708, "mean_token_accuracy": 0.8823742717504501, "num_tokens": 299322716.0, "step": 2799 }, { "epoch": 6.378563283922463, "grad_norm": 4.6875, "learning_rate": 1.5909068657893978e-06, "loss": 0.574, "mean_token_accuracy": 0.8821296989917755, "num_tokens": 299429926.0, "step": 2800 }, { "epoch": 6.380843785632839, "grad_norm": 2.75, "learning_rate": 1.5891526148322594e-06, "loss": 0.5701, "mean_token_accuracy": 0.8831614851951599, "num_tokens": 299537185.0, "step": 2801 }, { "epoch": 6.3831242873432155, "grad_norm": 3.125, "learning_rate": 1.5873988808545127e-06, "loss": 0.5669, "mean_token_accuracy": 0.8860654383897781, "num_tokens": 299644399.0, "step": 2802 }, { "epoch": 6.385404789053592, "grad_norm": 2.609375, "learning_rate": 1.5856456648515425e-06, "loss": 0.5397, "mean_token_accuracy": 0.8893062770366669, "num_tokens": 299751766.0, "step": 2803 }, { "epoch": 6.387685290763968, "grad_norm": 2.5, "learning_rate": 1.5838929678184405e-06, "loss": 0.5785, "mean_token_accuracy": 0.880397379398346, "num_tokens": 299857907.0, "step": 2804 }, { "epoch": 6.389965792474344, "grad_norm": 2.953125, "learning_rate": 1.5821407907500036e-06, "loss": 0.5597, "mean_token_accuracy": 0.8866594880819321, "num_tokens": 299965673.0, "step": 2805 }, { "epoch": 6.39224629418472, "grad_norm": 3.078125, "learning_rate": 1.5803891346407342e-06, "loss": 0.5633, "mean_token_accuracy": 0.8857071846723557, "num_tokens": 300072879.0, "step": 2806 }, { "epoch": 6.394526795895097, "grad_norm": 2.8125, "learning_rate": 1.5786380004848379e-06, "loss": 0.582, "mean_token_accuracy": 0.8817501962184906, "num_tokens": 300180006.0, "step": 2807 }, { "epoch": 6.3968072976054735, "grad_norm": 2.6875, "learning_rate": 1.576887389276226e-06, "loss": 0.5684, "mean_token_accuracy": 0.8835788518190384, "num_tokens": 300287280.0, "step": 2808 }, { "epoch": 6.39908779931585, "grad_norm": 2.9375, "learning_rate": 1.5751373020085093e-06, "loss": 0.5706, "mean_token_accuracy": 0.8786880671977997, "num_tokens": 300394445.0, "step": 2809 }, { "epoch": 6.401368301026226, "grad_norm": 3.046875, "learning_rate": 1.5733877396750051e-06, "loss": 0.5719, "mean_token_accuracy": 0.8815035223960876, "num_tokens": 300501045.0, "step": 2810 }, { "epoch": 6.403648802736602, "grad_norm": 2.53125, "learning_rate": 1.5716387032687314e-06, "loss": 0.5715, "mean_token_accuracy": 0.883103683590889, "num_tokens": 300607844.0, "step": 2811 }, { "epoch": 6.405929304446978, "grad_norm": 2.8125, "learning_rate": 1.5698901937824066e-06, "loss": 0.5944, "mean_token_accuracy": 0.8773173838853836, "num_tokens": 300714942.0, "step": 2812 }, { "epoch": 6.4082098061573545, "grad_norm": 3.0625, "learning_rate": 1.5681422122084522e-06, "loss": 0.5497, "mean_token_accuracy": 0.8895971029996872, "num_tokens": 300822920.0, "step": 2813 }, { "epoch": 6.410490307867731, "grad_norm": 4.4375, "learning_rate": 1.5663947595389873e-06, "loss": 0.5817, "mean_token_accuracy": 0.8785582333803177, "num_tokens": 300930457.0, "step": 2814 }, { "epoch": 6.412770809578107, "grad_norm": 2.609375, "learning_rate": 1.5646478367658325e-06, "loss": 0.5739, "mean_token_accuracy": 0.8812803626060486, "num_tokens": 301037070.0, "step": 2815 }, { "epoch": 6.415051311288483, "grad_norm": 3.09375, "learning_rate": 1.562901444880508e-06, "loss": 0.5524, "mean_token_accuracy": 0.8858920186758041, "num_tokens": 301144185.0, "step": 2816 }, { "epoch": 6.41733181299886, "grad_norm": 2.640625, "learning_rate": 1.5611555848742318e-06, "loss": 0.5678, "mean_token_accuracy": 0.8812452852725983, "num_tokens": 301251360.0, "step": 2817 }, { "epoch": 6.419612314709236, "grad_norm": 4.25, "learning_rate": 1.5594102577379216e-06, "loss": 0.5622, "mean_token_accuracy": 0.8858301192522049, "num_tokens": 301358185.0, "step": 2818 }, { "epoch": 6.4218928164196125, "grad_norm": 3.515625, "learning_rate": 1.5576654644621897e-06, "loss": 0.5916, "mean_token_accuracy": 0.8778669685125351, "num_tokens": 301464922.0, "step": 2819 }, { "epoch": 6.424173318129989, "grad_norm": 2.578125, "learning_rate": 1.5559212060373474e-06, "loss": 0.5671, "mean_token_accuracy": 0.8837179839611053, "num_tokens": 301571945.0, "step": 2820 }, { "epoch": 6.426453819840365, "grad_norm": 2.765625, "learning_rate": 1.5541774834534024e-06, "loss": 0.5865, "mean_token_accuracy": 0.8790989071130753, "num_tokens": 301678995.0, "step": 2821 }, { "epoch": 6.428734321550741, "grad_norm": 3.46875, "learning_rate": 1.5524342977000587e-06, "loss": 0.5816, "mean_token_accuracy": 0.880190521478653, "num_tokens": 301786689.0, "step": 2822 }, { "epoch": 6.431014823261117, "grad_norm": 3.703125, "learning_rate": 1.5506916497667134e-06, "loss": 0.5599, "mean_token_accuracy": 0.8833551555871964, "num_tokens": 301894003.0, "step": 2823 }, { "epoch": 6.4332953249714935, "grad_norm": 2.671875, "learning_rate": 1.5489495406424618e-06, "loss": 0.5376, "mean_token_accuracy": 0.8886000365018845, "num_tokens": 302001595.0, "step": 2824 }, { "epoch": 6.43557582668187, "grad_norm": 2.71875, "learning_rate": 1.5472079713160892e-06, "loss": 0.5669, "mean_token_accuracy": 0.8830095380544662, "num_tokens": 302109787.0, "step": 2825 }, { "epoch": 6.437856328392247, "grad_norm": 4.0, "learning_rate": 1.5454669427760774e-06, "loss": 0.5723, "mean_token_accuracy": 0.8822568953037262, "num_tokens": 302216211.0, "step": 2826 }, { "epoch": 6.440136830102623, "grad_norm": 2.9375, "learning_rate": 1.5437264560106014e-06, "loss": 0.5764, "mean_token_accuracy": 0.8839522004127502, "num_tokens": 302323292.0, "step": 2827 }, { "epoch": 6.442417331812999, "grad_norm": 2.75, "learning_rate": 1.5419865120075267e-06, "loss": 0.5576, "mean_token_accuracy": 0.8829583674669266, "num_tokens": 302430317.0, "step": 2828 }, { "epoch": 6.444697833523375, "grad_norm": 3.734375, "learning_rate": 1.5402471117544143e-06, "loss": 0.5663, "mean_token_accuracy": 0.8828336149454117, "num_tokens": 302537141.0, "step": 2829 }, { "epoch": 6.4469783352337515, "grad_norm": 3.1875, "learning_rate": 1.5385082562385112e-06, "loss": 0.5505, "mean_token_accuracy": 0.8884585201740265, "num_tokens": 302644739.0, "step": 2830 }, { "epoch": 6.449258836944128, "grad_norm": 2.828125, "learning_rate": 1.5367699464467596e-06, "loss": 0.5758, "mean_token_accuracy": 0.8810509741306305, "num_tokens": 302751800.0, "step": 2831 }, { "epoch": 6.451539338654504, "grad_norm": 4.1875, "learning_rate": 1.5350321833657904e-06, "loss": 0.6045, "mean_token_accuracy": 0.8751820474863052, "num_tokens": 302858532.0, "step": 2832 }, { "epoch": 6.45381984036488, "grad_norm": 3.046875, "learning_rate": 1.5332949679819251e-06, "loss": 0.5479, "mean_token_accuracy": 0.8875158727169037, "num_tokens": 302965381.0, "step": 2833 }, { "epoch": 6.456100342075256, "grad_norm": 3.6875, "learning_rate": 1.531558301281173e-06, "loss": 0.577, "mean_token_accuracy": 0.8815342336893082, "num_tokens": 303072747.0, "step": 2834 }, { "epoch": 6.4583808437856325, "grad_norm": 2.9375, "learning_rate": 1.5298221842492328e-06, "loss": 0.5683, "mean_token_accuracy": 0.8833811283111572, "num_tokens": 303179347.0, "step": 2835 }, { "epoch": 6.460661345496009, "grad_norm": 3.96875, "learning_rate": 1.5280866178714898e-06, "loss": 0.5662, "mean_token_accuracy": 0.8801722079515457, "num_tokens": 303287273.0, "step": 2836 }, { "epoch": 6.462941847206386, "grad_norm": 3.453125, "learning_rate": 1.5263516031330195e-06, "loss": 0.5798, "mean_token_accuracy": 0.8790634274482727, "num_tokens": 303394414.0, "step": 2837 }, { "epoch": 6.465222348916762, "grad_norm": 2.890625, "learning_rate": 1.524617141018582e-06, "loss": 0.5903, "mean_token_accuracy": 0.8738131821155548, "num_tokens": 303501157.0, "step": 2838 }, { "epoch": 6.467502850627138, "grad_norm": 2.796875, "learning_rate": 1.5228832325126248e-06, "loss": 0.5768, "mean_token_accuracy": 0.881572037935257, "num_tokens": 303608294.0, "step": 2839 }, { "epoch": 6.469783352337514, "grad_norm": 4.21875, "learning_rate": 1.5211498785992818e-06, "loss": 0.5717, "mean_token_accuracy": 0.8818154633045197, "num_tokens": 303715138.0, "step": 2840 }, { "epoch": 6.4720638540478905, "grad_norm": 3.65625, "learning_rate": 1.5194170802623692e-06, "loss": 0.5776, "mean_token_accuracy": 0.8801160901784897, "num_tokens": 303822023.0, "step": 2841 }, { "epoch": 6.474344355758267, "grad_norm": 3.484375, "learning_rate": 1.5176848384853913e-06, "loss": 0.5854, "mean_token_accuracy": 0.8776775300502777, "num_tokens": 303929007.0, "step": 2842 }, { "epoch": 6.476624857468643, "grad_norm": 2.859375, "learning_rate": 1.515953154251535e-06, "loss": 0.5725, "mean_token_accuracy": 0.8842798173427582, "num_tokens": 304035636.0, "step": 2843 }, { "epoch": 6.478905359179019, "grad_norm": 4.53125, "learning_rate": 1.5142220285436701e-06, "loss": 0.5627, "mean_token_accuracy": 0.8824753314256668, "num_tokens": 304142349.0, "step": 2844 }, { "epoch": 6.481185860889395, "grad_norm": 4.0, "learning_rate": 1.512491462344351e-06, "loss": 0.5708, "mean_token_accuracy": 0.8803005963563919, "num_tokens": 304249360.0, "step": 2845 }, { "epoch": 6.483466362599772, "grad_norm": 2.9375, "learning_rate": 1.5107614566358136e-06, "loss": 0.5729, "mean_token_accuracy": 0.8824707418680191, "num_tokens": 304357003.0, "step": 2846 }, { "epoch": 6.485746864310149, "grad_norm": 3.15625, "learning_rate": 1.5090320123999746e-06, "loss": 0.5918, "mean_token_accuracy": 0.8782989233732224, "num_tokens": 304463367.0, "step": 2847 }, { "epoch": 6.488027366020525, "grad_norm": 4.90625, "learning_rate": 1.5073031306184343e-06, "loss": 0.5776, "mean_token_accuracy": 0.882141649723053, "num_tokens": 304570072.0, "step": 2848 }, { "epoch": 6.490307867730901, "grad_norm": 3.15625, "learning_rate": 1.5055748122724722e-06, "loss": 0.585, "mean_token_accuracy": 0.8819586336612701, "num_tokens": 304677214.0, "step": 2849 }, { "epoch": 6.492588369441277, "grad_norm": 2.4375, "learning_rate": 1.5038470583430485e-06, "loss": 0.5446, "mean_token_accuracy": 0.8865179270505905, "num_tokens": 304784053.0, "step": 2850 }, { "epoch": 6.494868871151653, "grad_norm": 3.78125, "learning_rate": 1.5021198698108038e-06, "loss": 0.5841, "mean_token_accuracy": 0.8798740953207016, "num_tokens": 304891175.0, "step": 2851 }, { "epoch": 6.4971493728620295, "grad_norm": 6.3125, "learning_rate": 1.5003932476560554e-06, "loss": 0.5782, "mean_token_accuracy": 0.8817462623119354, "num_tokens": 304997679.0, "step": 2852 }, { "epoch": 6.499429874572406, "grad_norm": 4.03125, "learning_rate": 1.4986671928588016e-06, "loss": 0.5738, "mean_token_accuracy": 0.8805342614650726, "num_tokens": 305104659.0, "step": 2853 }, { "epoch": 6.501710376282782, "grad_norm": 7.0, "learning_rate": 1.496941706398718e-06, "loss": 0.5651, "mean_token_accuracy": 0.8827610611915588, "num_tokens": 305211801.0, "step": 2854 }, { "epoch": 6.503990877993158, "grad_norm": 3.4375, "learning_rate": 1.495216789255156e-06, "loss": 0.5888, "mean_token_accuracy": 0.8812805563211441, "num_tokens": 305317978.0, "step": 2855 }, { "epoch": 6.506271379703534, "grad_norm": 2.78125, "learning_rate": 1.4934924424071479e-06, "loss": 0.5954, "mean_token_accuracy": 0.8763656914234161, "num_tokens": 305424656.0, "step": 2856 }, { "epoch": 6.508551881413911, "grad_norm": 2.71875, "learning_rate": 1.4917686668333975e-06, "loss": 0.5881, "mean_token_accuracy": 0.881349191069603, "num_tokens": 305531468.0, "step": 2857 }, { "epoch": 6.510832383124288, "grad_norm": 6.21875, "learning_rate": 1.4900454635122866e-06, "loss": 0.5963, "mean_token_accuracy": 0.8755508661270142, "num_tokens": 305638201.0, "step": 2858 }, { "epoch": 6.513112884834664, "grad_norm": 3.921875, "learning_rate": 1.4883228334218727e-06, "loss": 0.5756, "mean_token_accuracy": 0.8790022134780884, "num_tokens": 305745038.0, "step": 2859 }, { "epoch": 6.51539338654504, "grad_norm": 6.1875, "learning_rate": 1.4866007775398874e-06, "loss": 0.5735, "mean_token_accuracy": 0.8798090815544128, "num_tokens": 305852570.0, "step": 2860 }, { "epoch": 6.51539338654504, "eval_loss": 0.5866165161132812, "eval_mean_token_accuracy": 0.8798247088497583, "eval_num_tokens": 305852570.0, "eval_runtime": 58.6358, "eval_samples_per_second": 143.001, "eval_steps_per_second": 4.485, "step": 2860 }, { "epoch": 6.517673888255416, "grad_norm": 2.484375, "learning_rate": 1.4848792968437376e-06, "loss": 0.54, "mean_token_accuracy": 0.888011172413826, "num_tokens": 305960317.0, "step": 2861 }, { "epoch": 6.519954389965792, "grad_norm": 3.234375, "learning_rate": 1.4831583923105e-06, "loss": 0.5677, "mean_token_accuracy": 0.8860371559858322, "num_tokens": 306067692.0, "step": 2862 }, { "epoch": 6.5222348916761685, "grad_norm": 2.921875, "learning_rate": 1.481438064916928e-06, "loss": 0.5586, "mean_token_accuracy": 0.8848079293966293, "num_tokens": 306174375.0, "step": 2863 }, { "epoch": 6.524515393386545, "grad_norm": 3.1875, "learning_rate": 1.4797183156394462e-06, "loss": 0.5544, "mean_token_accuracy": 0.8868988156318665, "num_tokens": 306282289.0, "step": 2864 }, { "epoch": 6.526795895096921, "grad_norm": 2.984375, "learning_rate": 1.477999145454152e-06, "loss": 0.5735, "mean_token_accuracy": 0.8846388012170792, "num_tokens": 306389414.0, "step": 2865 }, { "epoch": 6.529076396807298, "grad_norm": 5.15625, "learning_rate": 1.4762805553368115e-06, "loss": 0.5892, "mean_token_accuracy": 0.8790937066078186, "num_tokens": 306496182.0, "step": 2866 }, { "epoch": 6.531356898517674, "grad_norm": 3.90625, "learning_rate": 1.4745625462628654e-06, "loss": 0.5926, "mean_token_accuracy": 0.8804367631673813, "num_tokens": 306603764.0, "step": 2867 }, { "epoch": 6.53363740022805, "grad_norm": 4.0, "learning_rate": 1.47284511920742e-06, "loss": 0.5766, "mean_token_accuracy": 0.8812845051288605, "num_tokens": 306710535.0, "step": 2868 }, { "epoch": 6.535917901938427, "grad_norm": 3.421875, "learning_rate": 1.4711282751452549e-06, "loss": 0.6152, "mean_token_accuracy": 0.8755820095539093, "num_tokens": 306817409.0, "step": 2869 }, { "epoch": 6.538198403648803, "grad_norm": 3.28125, "learning_rate": 1.4694120150508179e-06, "loss": 0.5614, "mean_token_accuracy": 0.8860864490270615, "num_tokens": 306924651.0, "step": 2870 }, { "epoch": 6.540478905359179, "grad_norm": 3.921875, "learning_rate": 1.4676963398982248e-06, "loss": 0.5668, "mean_token_accuracy": 0.8829345405101776, "num_tokens": 307032470.0, "step": 2871 }, { "epoch": 6.542759407069555, "grad_norm": 3.390625, "learning_rate": 1.4659812506612608e-06, "loss": 0.5594, "mean_token_accuracy": 0.8877181857824326, "num_tokens": 307139315.0, "step": 2872 }, { "epoch": 6.545039908779931, "grad_norm": 3.578125, "learning_rate": 1.4642667483133753e-06, "loss": 0.5823, "mean_token_accuracy": 0.8820486813783646, "num_tokens": 307245855.0, "step": 2873 }, { "epoch": 6.5473204104903076, "grad_norm": 4.0, "learning_rate": 1.4625528338276879e-06, "loss": 0.5888, "mean_token_accuracy": 0.8795684576034546, "num_tokens": 307352674.0, "step": 2874 }, { "epoch": 6.549600912200685, "grad_norm": 3.015625, "learning_rate": 1.4608395081769833e-06, "loss": 0.551, "mean_token_accuracy": 0.885527640581131, "num_tokens": 307460336.0, "step": 2875 }, { "epoch": 6.55188141391106, "grad_norm": 2.875, "learning_rate": 1.4591267723337122e-06, "loss": 0.5699, "mean_token_accuracy": 0.8819777965545654, "num_tokens": 307567742.0, "step": 2876 }, { "epoch": 6.554161915621437, "grad_norm": 2.75, "learning_rate": 1.4574146272699914e-06, "loss": 0.566, "mean_token_accuracy": 0.8834614157676697, "num_tokens": 307674838.0, "step": 2877 }, { "epoch": 6.556442417331813, "grad_norm": 3.015625, "learning_rate": 1.4557030739575988e-06, "loss": 0.5621, "mean_token_accuracy": 0.8842470943927765, "num_tokens": 307782038.0, "step": 2878 }, { "epoch": 6.558722919042189, "grad_norm": 6.40625, "learning_rate": 1.4539921133679808e-06, "loss": 0.5837, "mean_token_accuracy": 0.8773439824581146, "num_tokens": 307889575.0, "step": 2879 }, { "epoch": 6.561003420752566, "grad_norm": 2.609375, "learning_rate": 1.4522817464722453e-06, "loss": 0.5685, "mean_token_accuracy": 0.8828644007444382, "num_tokens": 307996562.0, "step": 2880 }, { "epoch": 6.563283922462942, "grad_norm": 3.03125, "learning_rate": 1.4505719742411644e-06, "loss": 0.581, "mean_token_accuracy": 0.8790508657693863, "num_tokens": 308103487.0, "step": 2881 }, { "epoch": 6.565564424173318, "grad_norm": 3.140625, "learning_rate": 1.44886279764517e-06, "loss": 0.5806, "mean_token_accuracy": 0.881091371178627, "num_tokens": 308210017.0, "step": 2882 }, { "epoch": 6.567844925883694, "grad_norm": 3.890625, "learning_rate": 1.4471542176543587e-06, "loss": 0.5541, "mean_token_accuracy": 0.8873138725757599, "num_tokens": 308316619.0, "step": 2883 }, { "epoch": 6.57012542759407, "grad_norm": 3.421875, "learning_rate": 1.4454462352384885e-06, "loss": 0.5621, "mean_token_accuracy": 0.8838976472616196, "num_tokens": 308424126.0, "step": 2884 }, { "epoch": 6.572405929304447, "grad_norm": 2.40625, "learning_rate": 1.4437388513669754e-06, "loss": 0.5733, "mean_token_accuracy": 0.880536213517189, "num_tokens": 308531132.0, "step": 2885 }, { "epoch": 6.574686431014824, "grad_norm": 3.09375, "learning_rate": 1.4420320670088977e-06, "loss": 0.5665, "mean_token_accuracy": 0.8818255960941315, "num_tokens": 308639423.0, "step": 2886 }, { "epoch": 6.5769669327252, "grad_norm": 3.265625, "learning_rate": 1.4403258831329947e-06, "loss": 0.5709, "mean_token_accuracy": 0.8803568929433823, "num_tokens": 308746298.0, "step": 2887 }, { "epoch": 6.579247434435576, "grad_norm": 3.0625, "learning_rate": 1.4386203007076632e-06, "loss": 0.569, "mean_token_accuracy": 0.8798245489597321, "num_tokens": 308853686.0, "step": 2888 }, { "epoch": 6.581527936145952, "grad_norm": 4.53125, "learning_rate": 1.4369153207009573e-06, "loss": 0.5695, "mean_token_accuracy": 0.8845240324735641, "num_tokens": 308960699.0, "step": 2889 }, { "epoch": 6.583808437856328, "grad_norm": 3.84375, "learning_rate": 1.4352109440805917e-06, "loss": 0.5706, "mean_token_accuracy": 0.880035325884819, "num_tokens": 309067894.0, "step": 2890 }, { "epoch": 6.586088939566705, "grad_norm": 2.9375, "learning_rate": 1.4335071718139379e-06, "loss": 0.5742, "mean_token_accuracy": 0.8802482336759567, "num_tokens": 309175232.0, "step": 2891 }, { "epoch": 6.588369441277081, "grad_norm": 3.3125, "learning_rate": 1.4318040048680238e-06, "loss": 0.5359, "mean_token_accuracy": 0.8893679231405258, "num_tokens": 309282678.0, "step": 2892 }, { "epoch": 6.590649942987457, "grad_norm": 3.671875, "learning_rate": 1.430101444209535e-06, "loss": 0.5545, "mean_token_accuracy": 0.8879834562540054, "num_tokens": 309390130.0, "step": 2893 }, { "epoch": 6.592930444697833, "grad_norm": 3.421875, "learning_rate": 1.4283994908048107e-06, "loss": 0.5622, "mean_token_accuracy": 0.8846594393253326, "num_tokens": 309497117.0, "step": 2894 }, { "epoch": 6.59521094640821, "grad_norm": 3.171875, "learning_rate": 1.426698145619847e-06, "loss": 0.5772, "mean_token_accuracy": 0.8807022422552109, "num_tokens": 309604240.0, "step": 2895 }, { "epoch": 6.5974914481185865, "grad_norm": 2.640625, "learning_rate": 1.424997409620295e-06, "loss": 0.5594, "mean_token_accuracy": 0.881415531039238, "num_tokens": 309712035.0, "step": 2896 }, { "epoch": 6.599771949828963, "grad_norm": 3.140625, "learning_rate": 1.4232972837714598e-06, "loss": 0.5659, "mean_token_accuracy": 0.8861576318740845, "num_tokens": 309818871.0, "step": 2897 }, { "epoch": 6.602052451539339, "grad_norm": 3.265625, "learning_rate": 1.4215977690382998e-06, "loss": 0.5611, "mean_token_accuracy": 0.8870658576488495, "num_tokens": 309926653.0, "step": 2898 }, { "epoch": 6.604332953249715, "grad_norm": 2.96875, "learning_rate": 1.4198988663854276e-06, "loss": 0.5845, "mean_token_accuracy": 0.8792040795087814, "num_tokens": 310033580.0, "step": 2899 }, { "epoch": 6.606613454960091, "grad_norm": 2.859375, "learning_rate": 1.4182005767771057e-06, "loss": 0.5611, "mean_token_accuracy": 0.8829011768102646, "num_tokens": 310141344.0, "step": 2900 }, { "epoch": 6.608893956670467, "grad_norm": 3.734375, "learning_rate": 1.4165029011772513e-06, "loss": 0.5765, "mean_token_accuracy": 0.8812812268733978, "num_tokens": 310247976.0, "step": 2901 }, { "epoch": 6.611174458380844, "grad_norm": 3.1875, "learning_rate": 1.4148058405494328e-06, "loss": 0.572, "mean_token_accuracy": 0.8837906569242477, "num_tokens": 310354966.0, "step": 2902 }, { "epoch": 6.61345496009122, "grad_norm": 3.96875, "learning_rate": 1.4131093958568695e-06, "loss": 0.566, "mean_token_accuracy": 0.8823096752166748, "num_tokens": 310462145.0, "step": 2903 }, { "epoch": 6.615735461801596, "grad_norm": 3.21875, "learning_rate": 1.4114135680624291e-06, "loss": 0.5919, "mean_token_accuracy": 0.8795100599527359, "num_tokens": 310568947.0, "step": 2904 }, { "epoch": 6.618015963511972, "grad_norm": 3.046875, "learning_rate": 1.4097183581286322e-06, "loss": 0.5776, "mean_token_accuracy": 0.8784335255622864, "num_tokens": 310675494.0, "step": 2905 }, { "epoch": 6.620296465222349, "grad_norm": 4.53125, "learning_rate": 1.4080237670176456e-06, "loss": 0.5678, "mean_token_accuracy": 0.8832641243934631, "num_tokens": 310782897.0, "step": 2906 }, { "epoch": 6.6225769669327255, "grad_norm": 3.125, "learning_rate": 1.4063297956912875e-06, "loss": 0.5763, "mean_token_accuracy": 0.8814590126276016, "num_tokens": 310890161.0, "step": 2907 }, { "epoch": 6.624857468643102, "grad_norm": 2.5, "learning_rate": 1.4046364451110234e-06, "loss": 0.5637, "mean_token_accuracy": 0.884671688079834, "num_tokens": 310996832.0, "step": 2908 }, { "epoch": 6.627137970353478, "grad_norm": 3.265625, "learning_rate": 1.4029437162379666e-06, "loss": 0.5814, "mean_token_accuracy": 0.8783999532461166, "num_tokens": 311103426.0, "step": 2909 }, { "epoch": 6.629418472063854, "grad_norm": 4.3125, "learning_rate": 1.4012516100328766e-06, "loss": 0.5786, "mean_token_accuracy": 0.8824923038482666, "num_tokens": 311210237.0, "step": 2910 }, { "epoch": 6.63169897377423, "grad_norm": 4.40625, "learning_rate": 1.3995601274561605e-06, "loss": 0.571, "mean_token_accuracy": 0.8831194937229156, "num_tokens": 311317987.0, "step": 2911 }, { "epoch": 6.633979475484606, "grad_norm": 3.0625, "learning_rate": 1.3978692694678711e-06, "loss": 0.5727, "mean_token_accuracy": 0.8810025453567505, "num_tokens": 311424839.0, "step": 2912 }, { "epoch": 6.636259977194983, "grad_norm": 2.90625, "learning_rate": 1.3961790370277068e-06, "loss": 0.5655, "mean_token_accuracy": 0.8853645473718643, "num_tokens": 311531661.0, "step": 2913 }, { "epoch": 6.638540478905359, "grad_norm": 2.734375, "learning_rate": 1.3944894310950113e-06, "loss": 0.5772, "mean_token_accuracy": 0.8820922076702118, "num_tokens": 311639163.0, "step": 2914 }, { "epoch": 6.640820980615736, "grad_norm": 3.5625, "learning_rate": 1.3928004526287729e-06, "loss": 0.5713, "mean_token_accuracy": 0.8839680552482605, "num_tokens": 311746011.0, "step": 2915 }, { "epoch": 6.643101482326112, "grad_norm": 3.59375, "learning_rate": 1.3911121025876212e-06, "loss": 0.5661, "mean_token_accuracy": 0.8828011155128479, "num_tokens": 311852925.0, "step": 2916 }, { "epoch": 6.645381984036488, "grad_norm": 3.296875, "learning_rate": 1.389424381929832e-06, "loss": 0.5658, "mean_token_accuracy": 0.8824039697647095, "num_tokens": 311960025.0, "step": 2917 }, { "epoch": 6.6476624857468645, "grad_norm": 2.9375, "learning_rate": 1.3877372916133234e-06, "loss": 0.5964, "mean_token_accuracy": 0.8741250783205032, "num_tokens": 312066896.0, "step": 2918 }, { "epoch": 6.649942987457241, "grad_norm": 4.6875, "learning_rate": 1.3860508325956549e-06, "loss": 0.6063, "mean_token_accuracy": 0.8746428936719894, "num_tokens": 312173856.0, "step": 2919 }, { "epoch": 6.652223489167617, "grad_norm": 3.59375, "learning_rate": 1.3843650058340291e-06, "loss": 0.5702, "mean_token_accuracy": 0.8829982429742813, "num_tokens": 312280996.0, "step": 2920 }, { "epoch": 6.654503990877993, "grad_norm": 3.859375, "learning_rate": 1.382679812285287e-06, "loss": 0.5725, "mean_token_accuracy": 0.8866865336894989, "num_tokens": 312388440.0, "step": 2921 }, { "epoch": 6.656784492588369, "grad_norm": 3.21875, "learning_rate": 1.3809952529059127e-06, "loss": 0.5597, "mean_token_accuracy": 0.8881336748600006, "num_tokens": 312495421.0, "step": 2922 }, { "epoch": 6.659064994298745, "grad_norm": 4.0, "learning_rate": 1.3793113286520293e-06, "loss": 0.563, "mean_token_accuracy": 0.884793609380722, "num_tokens": 312602917.0, "step": 2923 }, { "epoch": 6.661345496009122, "grad_norm": 3.390625, "learning_rate": 1.3776280404794016e-06, "loss": 0.5775, "mean_token_accuracy": 0.8788406997919083, "num_tokens": 312710035.0, "step": 2924 }, { "epoch": 6.663625997719498, "grad_norm": 2.84375, "learning_rate": 1.3759453893434285e-06, "loss": 0.5722, "mean_token_accuracy": 0.8826200515031815, "num_tokens": 312816809.0, "step": 2925 }, { "epoch": 6.665906499429875, "grad_norm": 3.640625, "learning_rate": 1.3742633761991519e-06, "loss": 0.5746, "mean_token_accuracy": 0.8815308213233948, "num_tokens": 312923988.0, "step": 2926 }, { "epoch": 6.668187001140251, "grad_norm": 3.28125, "learning_rate": 1.3725820020012506e-06, "loss": 0.5486, "mean_token_accuracy": 0.8900953233242035, "num_tokens": 313031446.0, "step": 2927 }, { "epoch": 6.670467502850627, "grad_norm": 2.59375, "learning_rate": 1.3709012677040385e-06, "loss": 0.5719, "mean_token_accuracy": 0.882040336728096, "num_tokens": 313138636.0, "step": 2928 }, { "epoch": 6.6727480045610035, "grad_norm": 2.671875, "learning_rate": 1.3692211742614686e-06, "loss": 0.5656, "mean_token_accuracy": 0.8804366439580917, "num_tokens": 313245263.0, "step": 2929 }, { "epoch": 6.67502850627138, "grad_norm": 4.25, "learning_rate": 1.3675417226271298e-06, "loss": 0.5615, "mean_token_accuracy": 0.8823003172874451, "num_tokens": 313353027.0, "step": 2930 }, { "epoch": 6.677309007981756, "grad_norm": 3.15625, "learning_rate": 1.365862913754247e-06, "loss": 0.5825, "mean_token_accuracy": 0.8799808919429779, "num_tokens": 313460650.0, "step": 2931 }, { "epoch": 6.679589509692132, "grad_norm": 5.90625, "learning_rate": 1.3641847485956782e-06, "loss": 0.5846, "mean_token_accuracy": 0.8758883625268936, "num_tokens": 313567543.0, "step": 2932 }, { "epoch": 6.681870011402508, "grad_norm": 3.015625, "learning_rate": 1.362507228103918e-06, "loss": 0.5508, "mean_token_accuracy": 0.8851123601198196, "num_tokens": 313674854.0, "step": 2933 }, { "epoch": 6.684150513112884, "grad_norm": 2.59375, "learning_rate": 1.3608303532310956e-06, "loss": 0.5605, "mean_token_accuracy": 0.884984016418457, "num_tokens": 313781754.0, "step": 2934 }, { "epoch": 6.6864310148232615, "grad_norm": 2.96875, "learning_rate": 1.3591541249289718e-06, "loss": 0.5382, "mean_token_accuracy": 0.8888740241527557, "num_tokens": 313889331.0, "step": 2935 }, { "epoch": 6.688711516533638, "grad_norm": 2.75, "learning_rate": 1.357478544148943e-06, "loss": 0.5517, "mean_token_accuracy": 0.8864340782165527, "num_tokens": 313997028.0, "step": 2936 }, { "epoch": 6.690992018244014, "grad_norm": 2.78125, "learning_rate": 1.3558036118420343e-06, "loss": 0.5695, "mean_token_accuracy": 0.8856192529201508, "num_tokens": 314104052.0, "step": 2937 }, { "epoch": 6.69327251995439, "grad_norm": 3.25, "learning_rate": 1.3541293289589058e-06, "loss": 0.5629, "mean_token_accuracy": 0.882482260465622, "num_tokens": 314211575.0, "step": 2938 }, { "epoch": 6.695553021664766, "grad_norm": 4.125, "learning_rate": 1.3524556964498482e-06, "loss": 0.5554, "mean_token_accuracy": 0.88676318526268, "num_tokens": 314318790.0, "step": 2939 }, { "epoch": 6.6978335233751425, "grad_norm": 2.78125, "learning_rate": 1.3507827152647835e-06, "loss": 0.5654, "mean_token_accuracy": 0.8853241056203842, "num_tokens": 314425965.0, "step": 2940 }, { "epoch": 6.700114025085519, "grad_norm": 2.53125, "learning_rate": 1.3491103863532626e-06, "loss": 0.5577, "mean_token_accuracy": 0.8847237229347229, "num_tokens": 314533161.0, "step": 2941 }, { "epoch": 6.702394526795895, "grad_norm": 3.46875, "learning_rate": 1.3474387106644688e-06, "loss": 0.5874, "mean_token_accuracy": 0.8763073831796646, "num_tokens": 314640305.0, "step": 2942 }, { "epoch": 6.704675028506271, "grad_norm": 4.5, "learning_rate": 1.345767689147211e-06, "loss": 0.5751, "mean_token_accuracy": 0.8827333003282547, "num_tokens": 314747484.0, "step": 2943 }, { "epoch": 6.706955530216648, "grad_norm": 2.90625, "learning_rate": 1.3440973227499293e-06, "loss": 0.5799, "mean_token_accuracy": 0.8817901015281677, "num_tokens": 314854356.0, "step": 2944 }, { "epoch": 6.7092360319270234, "grad_norm": 2.6875, "learning_rate": 1.3424276124206917e-06, "loss": 0.5739, "mean_token_accuracy": 0.8832406252622604, "num_tokens": 314961203.0, "step": 2945 }, { "epoch": 6.7115165336374005, "grad_norm": 3.0, "learning_rate": 1.3407585591071944e-06, "loss": 0.5863, "mean_token_accuracy": 0.8804884552955627, "num_tokens": 315067663.0, "step": 2946 }, { "epoch": 6.713797035347777, "grad_norm": 6.0, "learning_rate": 1.3390901637567579e-06, "loss": 0.5654, "mean_token_accuracy": 0.8794064670801163, "num_tokens": 315175060.0, "step": 2947 }, { "epoch": 6.716077537058153, "grad_norm": 3.15625, "learning_rate": 1.3374224273163334e-06, "loss": 0.5867, "mean_token_accuracy": 0.8783332705497742, "num_tokens": 315281889.0, "step": 2948 }, { "epoch": 6.718358038768529, "grad_norm": 3.4375, "learning_rate": 1.3357553507324938e-06, "loss": 0.5681, "mean_token_accuracy": 0.8839272111654282, "num_tokens": 315388559.0, "step": 2949 }, { "epoch": 6.720638540478905, "grad_norm": 5.5625, "learning_rate": 1.3340889349514403e-06, "loss": 0.5636, "mean_token_accuracy": 0.8809895366430283, "num_tokens": 315495344.0, "step": 2950 }, { "epoch": 6.7229190421892815, "grad_norm": 3.9375, "learning_rate": 1.3324231809189985e-06, "loss": 0.5728, "mean_token_accuracy": 0.8813484013080597, "num_tokens": 315602116.0, "step": 2951 }, { "epoch": 6.725199543899658, "grad_norm": 3.453125, "learning_rate": 1.3307580895806194e-06, "loss": 0.581, "mean_token_accuracy": 0.8836003094911575, "num_tokens": 315709172.0, "step": 2952 }, { "epoch": 6.727480045610034, "grad_norm": 3.375, "learning_rate": 1.3290936618813747e-06, "loss": 0.5706, "mean_token_accuracy": 0.8845668733119965, "num_tokens": 315816102.0, "step": 2953 }, { "epoch": 6.72976054732041, "grad_norm": 2.703125, "learning_rate": 1.327429898765962e-06, "loss": 0.5712, "mean_token_accuracy": 0.8821906894445419, "num_tokens": 315923130.0, "step": 2954 }, { "epoch": 6.732041049030787, "grad_norm": 3.21875, "learning_rate": 1.3257668011787018e-06, "loss": 0.5899, "mean_token_accuracy": 0.8804189115762711, "num_tokens": 316030172.0, "step": 2955 }, { "epoch": 6.734321550741163, "grad_norm": 2.734375, "learning_rate": 1.3241043700635352e-06, "loss": 0.5801, "mean_token_accuracy": 0.8796321153640747, "num_tokens": 316136895.0, "step": 2956 }, { "epoch": 6.7366020524515395, "grad_norm": 3.09375, "learning_rate": 1.3224426063640272e-06, "loss": 0.5823, "mean_token_accuracy": 0.8832730054855347, "num_tokens": 316243931.0, "step": 2957 }, { "epoch": 6.738882554161916, "grad_norm": 3.015625, "learning_rate": 1.320781511023363e-06, "loss": 0.5692, "mean_token_accuracy": 0.883409783244133, "num_tokens": 316351625.0, "step": 2958 }, { "epoch": 6.741163055872292, "grad_norm": 4.84375, "learning_rate": 1.3191210849843461e-06, "loss": 0.581, "mean_token_accuracy": 0.8800751566886902, "num_tokens": 316459647.0, "step": 2959 }, { "epoch": 6.743443557582668, "grad_norm": 4.15625, "learning_rate": 1.3174613291894039e-06, "loss": 0.5953, "mean_token_accuracy": 0.8763148337602615, "num_tokens": 316566718.0, "step": 2960 }, { "epoch": 6.745724059293044, "grad_norm": 3.34375, "learning_rate": 1.3158022445805816e-06, "loss": 0.5492, "mean_token_accuracy": 0.8837677389383316, "num_tokens": 316673847.0, "step": 2961 }, { "epoch": 6.7480045610034205, "grad_norm": 2.953125, "learning_rate": 1.3141438320995433e-06, "loss": 0.577, "mean_token_accuracy": 0.8806140273809433, "num_tokens": 316780635.0, "step": 2962 }, { "epoch": 6.750285062713797, "grad_norm": 2.796875, "learning_rate": 1.3124860926875732e-06, "loss": 0.5505, "mean_token_accuracy": 0.8896244168281555, "num_tokens": 316887938.0, "step": 2963 }, { "epoch": 6.752565564424174, "grad_norm": 2.84375, "learning_rate": 1.3108290272855697e-06, "loss": 0.5678, "mean_token_accuracy": 0.8813838809728622, "num_tokens": 316995335.0, "step": 2964 }, { "epoch": 6.75484606613455, "grad_norm": 5.0625, "learning_rate": 1.309172636834053e-06, "loss": 0.5642, "mean_token_accuracy": 0.8819121271371841, "num_tokens": 317102837.0, "step": 2965 }, { "epoch": 6.757126567844926, "grad_norm": 2.90625, "learning_rate": 1.3075169222731573e-06, "loss": 0.5684, "mean_token_accuracy": 0.8826228231191635, "num_tokens": 317210216.0, "step": 2966 }, { "epoch": 6.759407069555302, "grad_norm": 3.5, "learning_rate": 1.305861884542636e-06, "loss": 0.5793, "mean_token_accuracy": 0.8787815272808075, "num_tokens": 317317069.0, "step": 2967 }, { "epoch": 6.7616875712656785, "grad_norm": 4.875, "learning_rate": 1.3042075245818542e-06, "loss": 0.5684, "mean_token_accuracy": 0.8827643990516663, "num_tokens": 317424086.0, "step": 2968 }, { "epoch": 6.763968072976055, "grad_norm": 3.296875, "learning_rate": 1.3025538433297957e-06, "loss": 0.5791, "mean_token_accuracy": 0.8830547332763672, "num_tokens": 317531744.0, "step": 2969 }, { "epoch": 6.766248574686431, "grad_norm": 2.765625, "learning_rate": 1.3009008417250597e-06, "loss": 0.5753, "mean_token_accuracy": 0.883535161614418, "num_tokens": 317638896.0, "step": 2970 }, { "epoch": 6.768529076396807, "grad_norm": 2.53125, "learning_rate": 1.2992485207058548e-06, "loss": 0.5958, "mean_token_accuracy": 0.8761639446020126, "num_tokens": 317745636.0, "step": 2971 }, { "epoch": 6.770809578107183, "grad_norm": 2.625, "learning_rate": 1.2975968812100081e-06, "loss": 0.5413, "mean_token_accuracy": 0.8862400203943253, "num_tokens": 317852822.0, "step": 2972 }, { "epoch": 6.7730900798175595, "grad_norm": 4.6875, "learning_rate": 1.295945924174959e-06, "loss": 0.585, "mean_token_accuracy": 0.8786645531654358, "num_tokens": 317959386.0, "step": 2973 }, { "epoch": 6.775370581527936, "grad_norm": 3.03125, "learning_rate": 1.2942956505377585e-06, "loss": 0.565, "mean_token_accuracy": 0.8845228105783463, "num_tokens": 318066201.0, "step": 2974 }, { "epoch": 6.777651083238313, "grad_norm": 3.703125, "learning_rate": 1.2926460612350688e-06, "loss": 0.5577, "mean_token_accuracy": 0.883774995803833, "num_tokens": 318173322.0, "step": 2975 }, { "epoch": 6.779931584948689, "grad_norm": 3.421875, "learning_rate": 1.2909971572031663e-06, "loss": 0.558, "mean_token_accuracy": 0.8879963010549545, "num_tokens": 318281003.0, "step": 2976 }, { "epoch": 6.782212086659065, "grad_norm": 3.375, "learning_rate": 1.2893489393779362e-06, "loss": 0.5749, "mean_token_accuracy": 0.8839980363845825, "num_tokens": 318387942.0, "step": 2977 }, { "epoch": 6.784492588369441, "grad_norm": 2.53125, "learning_rate": 1.2877014086948762e-06, "loss": 0.5757, "mean_token_accuracy": 0.8807279914617538, "num_tokens": 318494877.0, "step": 2978 }, { "epoch": 6.7867730900798175, "grad_norm": 3.171875, "learning_rate": 1.2860545660890928e-06, "loss": 0.5537, "mean_token_accuracy": 0.8867141306400299, "num_tokens": 318601668.0, "step": 2979 }, { "epoch": 6.789053591790194, "grad_norm": 2.34375, "learning_rate": 1.2844084124953006e-06, "loss": 0.5662, "mean_token_accuracy": 0.8846204280853271, "num_tokens": 318708916.0, "step": 2980 }, { "epoch": 6.79133409350057, "grad_norm": 5.25, "learning_rate": 1.2827629488478254e-06, "loss": 0.5706, "mean_token_accuracy": 0.8803286552429199, "num_tokens": 318816351.0, "step": 2981 }, { "epoch": 6.793614595210946, "grad_norm": 3.25, "learning_rate": 1.2811181760806013e-06, "loss": 0.5722, "mean_token_accuracy": 0.8851221948862076, "num_tokens": 318923078.0, "step": 2982 }, { "epoch": 6.795895096921322, "grad_norm": 4.21875, "learning_rate": 1.2794740951271686e-06, "loss": 0.5837, "mean_token_accuracy": 0.8797405809164047, "num_tokens": 319029672.0, "step": 2983 }, { "epoch": 6.798175598631699, "grad_norm": 3.765625, "learning_rate": 1.2778307069206764e-06, "loss": 0.5599, "mean_token_accuracy": 0.8823461681604385, "num_tokens": 319136574.0, "step": 2984 }, { "epoch": 6.800456100342076, "grad_norm": 3.265625, "learning_rate": 1.2761880123938814e-06, "loss": 0.5618, "mean_token_accuracy": 0.8868078589439392, "num_tokens": 319244374.0, "step": 2985 }, { "epoch": 6.802736602052452, "grad_norm": 3.09375, "learning_rate": 1.2745460124791425e-06, "loss": 0.575, "mean_token_accuracy": 0.8795672506093979, "num_tokens": 319352045.0, "step": 2986 }, { "epoch": 6.805017103762828, "grad_norm": 3.59375, "learning_rate": 1.272904708108429e-06, "loss": 0.5784, "mean_token_accuracy": 0.8827357441186905, "num_tokens": 319458393.0, "step": 2987 }, { "epoch": 6.807297605473204, "grad_norm": 3.421875, "learning_rate": 1.2712641002133128e-06, "loss": 0.6055, "mean_token_accuracy": 0.8749971687793732, "num_tokens": 319564852.0, "step": 2988 }, { "epoch": 6.80957810718358, "grad_norm": 3.65625, "learning_rate": 1.2696241897249728e-06, "loss": 0.5863, "mean_token_accuracy": 0.8824810981750488, "num_tokens": 319671606.0, "step": 2989 }, { "epoch": 6.811858608893957, "grad_norm": 3.578125, "learning_rate": 1.2679849775741884e-06, "loss": 0.5606, "mean_token_accuracy": 0.8874455839395523, "num_tokens": 319778774.0, "step": 2990 }, { "epoch": 6.814139110604333, "grad_norm": 2.84375, "learning_rate": 1.266346464691346e-06, "loss": 0.5838, "mean_token_accuracy": 0.8811825066804886, "num_tokens": 319885624.0, "step": 2991 }, { "epoch": 6.816419612314709, "grad_norm": 3.25, "learning_rate": 1.2647086520064343e-06, "loss": 0.575, "mean_token_accuracy": 0.880624532699585, "num_tokens": 319992897.0, "step": 2992 }, { "epoch": 6.818700114025085, "grad_norm": 2.90625, "learning_rate": 1.2630715404490424e-06, "loss": 0.5768, "mean_token_accuracy": 0.8810169398784637, "num_tokens": 320099665.0, "step": 2993 }, { "epoch": 6.820980615735461, "grad_norm": 2.625, "learning_rate": 1.2614351309483646e-06, "loss": 0.5474, "mean_token_accuracy": 0.8881228417158127, "num_tokens": 320207542.0, "step": 2994 }, { "epoch": 6.823261117445838, "grad_norm": 2.78125, "learning_rate": 1.259799424433196e-06, "loss": 0.5763, "mean_token_accuracy": 0.8811034262180328, "num_tokens": 320314111.0, "step": 2995 }, { "epoch": 6.825541619156215, "grad_norm": 3.171875, "learning_rate": 1.25816442183193e-06, "loss": 0.5749, "mean_token_accuracy": 0.8809540122747421, "num_tokens": 320421117.0, "step": 2996 }, { "epoch": 6.827822120866591, "grad_norm": 2.953125, "learning_rate": 1.2565301240725636e-06, "loss": 0.5847, "mean_token_accuracy": 0.8778086453676224, "num_tokens": 320527962.0, "step": 2997 }, { "epoch": 6.830102622576967, "grad_norm": 3.015625, "learning_rate": 1.2548965320826928e-06, "loss": 0.5667, "mean_token_accuracy": 0.883897066116333, "num_tokens": 320634881.0, "step": 2998 }, { "epoch": 6.832383124287343, "grad_norm": 3.0, "learning_rate": 1.2532636467895126e-06, "loss": 0.5781, "mean_token_accuracy": 0.8828509300947189, "num_tokens": 320741780.0, "step": 2999 }, { "epoch": 6.834663625997719, "grad_norm": 3.375, "learning_rate": 1.2516314691198172e-06, "loss": 0.5606, "mean_token_accuracy": 0.8836653828620911, "num_tokens": 320849047.0, "step": 3000 }, { "epoch": 6.836944127708096, "grad_norm": 3.890625, "learning_rate": 1.2500000000000007e-06, "loss": 0.5653, "mean_token_accuracy": 0.8793622553348541, "num_tokens": 320955644.0, "step": 3001 }, { "epoch": 6.839224629418472, "grad_norm": 3.75, "learning_rate": 1.2483692403560507e-06, "loss": 0.5484, "mean_token_accuracy": 0.8884700238704681, "num_tokens": 321063849.0, "step": 3002 }, { "epoch": 6.841505131128848, "grad_norm": 2.921875, "learning_rate": 1.2467391911135562e-06, "loss": 0.5754, "mean_token_accuracy": 0.8834027796983719, "num_tokens": 321170372.0, "step": 3003 }, { "epoch": 6.843785632839225, "grad_norm": 4.0, "learning_rate": 1.2451098531977015e-06, "loss": 0.5687, "mean_token_accuracy": 0.8847081363201141, "num_tokens": 321277858.0, "step": 3004 }, { "epoch": 6.846066134549601, "grad_norm": 2.875, "learning_rate": 1.2434812275332678e-06, "loss": 0.536, "mean_token_accuracy": 0.8904493749141693, "num_tokens": 321384888.0, "step": 3005 }, { "epoch": 6.848346636259977, "grad_norm": 2.796875, "learning_rate": 1.2418533150446324e-06, "loss": 0.5543, "mean_token_accuracy": 0.8845214545726776, "num_tokens": 321491996.0, "step": 3006 }, { "epoch": 6.850627137970354, "grad_norm": 2.796875, "learning_rate": 1.2402261166557647e-06, "loss": 0.5801, "mean_token_accuracy": 0.8830210715532303, "num_tokens": 321598410.0, "step": 3007 }, { "epoch": 6.85290763968073, "grad_norm": 2.765625, "learning_rate": 1.2385996332902326e-06, "loss": 0.5717, "mean_token_accuracy": 0.8822718113660812, "num_tokens": 321705299.0, "step": 3008 }, { "epoch": 6.855188141391106, "grad_norm": 2.765625, "learning_rate": 1.236973865871196e-06, "loss": 0.5725, "mean_token_accuracy": 0.8847811222076416, "num_tokens": 321811999.0, "step": 3009 }, { "epoch": 6.857468643101482, "grad_norm": 2.953125, "learning_rate": 1.2353488153214096e-06, "loss": 0.5683, "mean_token_accuracy": 0.8817588984966278, "num_tokens": 321919613.0, "step": 3010 }, { "epoch": 6.859749144811858, "grad_norm": 2.640625, "learning_rate": 1.2337244825632217e-06, "loss": 0.5628, "mean_token_accuracy": 0.8837725669145584, "num_tokens": 322026837.0, "step": 3011 }, { "epoch": 6.862029646522235, "grad_norm": 3.734375, "learning_rate": 1.2321008685185699e-06, "loss": 0.588, "mean_token_accuracy": 0.8756814897060394, "num_tokens": 322132737.0, "step": 3012 }, { "epoch": 6.864310148232612, "grad_norm": 2.46875, "learning_rate": 1.2304779741089884e-06, "loss": 0.559, "mean_token_accuracy": 0.8843032121658325, "num_tokens": 322239975.0, "step": 3013 }, { "epoch": 6.866590649942988, "grad_norm": 3.90625, "learning_rate": 1.228855800255599e-06, "loss": 0.5796, "mean_token_accuracy": 0.8782118856906891, "num_tokens": 322346564.0, "step": 3014 }, { "epoch": 6.868871151653364, "grad_norm": 2.8125, "learning_rate": 1.2272343478791165e-06, "loss": 0.5709, "mean_token_accuracy": 0.8836709409952164, "num_tokens": 322453680.0, "step": 3015 }, { "epoch": 6.87115165336374, "grad_norm": 2.6875, "learning_rate": 1.2256136178998468e-06, "loss": 0.5737, "mean_token_accuracy": 0.8812815845012665, "num_tokens": 322561481.0, "step": 3016 }, { "epoch": 6.873432155074116, "grad_norm": 3.015625, "learning_rate": 1.2239936112376858e-06, "loss": 0.5769, "mean_token_accuracy": 0.8811852186918259, "num_tokens": 322668877.0, "step": 3017 }, { "epoch": 6.875712656784493, "grad_norm": 3.828125, "learning_rate": 1.2223743288121155e-06, "loss": 0.5657, "mean_token_accuracy": 0.8837666660547256, "num_tokens": 322775613.0, "step": 3018 }, { "epoch": 6.877993158494869, "grad_norm": 3.171875, "learning_rate": 1.2207557715422106e-06, "loss": 0.5742, "mean_token_accuracy": 0.8822371959686279, "num_tokens": 322883149.0, "step": 3019 }, { "epoch": 6.880273660205245, "grad_norm": 2.859375, "learning_rate": 1.219137940346633e-06, "loss": 0.5757, "mean_token_accuracy": 0.880626305937767, "num_tokens": 322989979.0, "step": 3020 }, { "epoch": 6.882554161915621, "grad_norm": 5.4375, "learning_rate": 1.2175208361436328e-06, "loss": 0.5689, "mean_token_accuracy": 0.8829739540815353, "num_tokens": 323096950.0, "step": 3021 }, { "epoch": 6.884834663625997, "grad_norm": 3.4375, "learning_rate": 1.2159044598510473e-06, "loss": 0.5621, "mean_token_accuracy": 0.8852163255214691, "num_tokens": 323204095.0, "step": 3022 }, { "epoch": 6.887115165336374, "grad_norm": 2.765625, "learning_rate": 1.2142888123862992e-06, "loss": 0.5789, "mean_token_accuracy": 0.8801557719707489, "num_tokens": 323311535.0, "step": 3023 }, { "epoch": 6.889395667046751, "grad_norm": 3.703125, "learning_rate": 1.2126738946663996e-06, "loss": 0.5741, "mean_token_accuracy": 0.8806950151920319, "num_tokens": 323418174.0, "step": 3024 }, { "epoch": 6.891676168757127, "grad_norm": 3.171875, "learning_rate": 1.2110597076079448e-06, "loss": 0.5536, "mean_token_accuracy": 0.8857726603746414, "num_tokens": 323524933.0, "step": 3025 }, { "epoch": 6.893956670467503, "grad_norm": 2.953125, "learning_rate": 1.2094462521271156e-06, "loss": 0.564, "mean_token_accuracy": 0.8852896243333817, "num_tokens": 323631823.0, "step": 3026 }, { "epoch": 6.896237172177879, "grad_norm": 3.015625, "learning_rate": 1.2078335291396798e-06, "loss": 0.5733, "mean_token_accuracy": 0.8801656812429428, "num_tokens": 323738919.0, "step": 3027 }, { "epoch": 6.898517673888255, "grad_norm": 3.1875, "learning_rate": 1.2062215395609856e-06, "loss": 0.5623, "mean_token_accuracy": 0.8838664293289185, "num_tokens": 323845761.0, "step": 3028 }, { "epoch": 6.900798175598632, "grad_norm": 3.484375, "learning_rate": 1.2046102843059681e-06, "loss": 0.5774, "mean_token_accuracy": 0.8809877783060074, "num_tokens": 323952934.0, "step": 3029 }, { "epoch": 6.903078677309008, "grad_norm": 3.046875, "learning_rate": 1.202999764289145e-06, "loss": 0.5805, "mean_token_accuracy": 0.8807922154664993, "num_tokens": 324060199.0, "step": 3030 }, { "epoch": 6.905359179019384, "grad_norm": 2.828125, "learning_rate": 1.201389980424616e-06, "loss": 0.5782, "mean_token_accuracy": 0.8789155036211014, "num_tokens": 324166935.0, "step": 3031 }, { "epoch": 6.90763968072976, "grad_norm": 3.609375, "learning_rate": 1.1997809336260644e-06, "loss": 0.5569, "mean_token_accuracy": 0.8868138492107391, "num_tokens": 324273307.0, "step": 3032 }, { "epoch": 6.909920182440137, "grad_norm": 2.734375, "learning_rate": 1.1981726248067521e-06, "loss": 0.5478, "mean_token_accuracy": 0.8851444870233536, "num_tokens": 324380120.0, "step": 3033 }, { "epoch": 6.9122006841505135, "grad_norm": 2.734375, "learning_rate": 1.1965650548795251e-06, "loss": 0.5511, "mean_token_accuracy": 0.8851585388183594, "num_tokens": 324487235.0, "step": 3034 }, { "epoch": 6.91448118586089, "grad_norm": 3.484375, "learning_rate": 1.1949582247568107e-06, "loss": 0.5739, "mean_token_accuracy": 0.8827362954616547, "num_tokens": 324594111.0, "step": 3035 }, { "epoch": 6.916761687571266, "grad_norm": 5.15625, "learning_rate": 1.1933521353506117e-06, "loss": 0.5604, "mean_token_accuracy": 0.8829712569713593, "num_tokens": 324701818.0, "step": 3036 }, { "epoch": 6.919042189281642, "grad_norm": 3.515625, "learning_rate": 1.1917467875725148e-06, "loss": 0.5528, "mean_token_accuracy": 0.8853205442428589, "num_tokens": 324808789.0, "step": 3037 }, { "epoch": 6.921322690992018, "grad_norm": 2.828125, "learning_rate": 1.1901421823336856e-06, "loss": 0.5456, "mean_token_accuracy": 0.885327011346817, "num_tokens": 324916062.0, "step": 3038 }, { "epoch": 6.923603192702394, "grad_norm": 4.0625, "learning_rate": 1.188538320544865e-06, "loss": 0.5632, "mean_token_accuracy": 0.880491703748703, "num_tokens": 325023122.0, "step": 3039 }, { "epoch": 6.925883694412771, "grad_norm": 2.859375, "learning_rate": 1.1869352031163746e-06, "loss": 0.5819, "mean_token_accuracy": 0.880371481180191, "num_tokens": 325130007.0, "step": 3040 }, { "epoch": 6.928164196123147, "grad_norm": 3.578125, "learning_rate": 1.1853328309581139e-06, "loss": 0.5676, "mean_token_accuracy": 0.884147435426712, "num_tokens": 325236627.0, "step": 3041 }, { "epoch": 6.930444697833523, "grad_norm": 3.328125, "learning_rate": 1.183731204979557e-06, "loss": 0.5679, "mean_token_accuracy": 0.8821654170751572, "num_tokens": 325344292.0, "step": 3042 }, { "epoch": 6.932725199543899, "grad_norm": 2.96875, "learning_rate": 1.182130326089758e-06, "loss": 0.566, "mean_token_accuracy": 0.8848479390144348, "num_tokens": 325451752.0, "step": 3043 }, { "epoch": 6.935005701254276, "grad_norm": 4.71875, "learning_rate": 1.1805301951973423e-06, "loss": 0.57, "mean_token_accuracy": 0.8822412639856339, "num_tokens": 325558775.0, "step": 3044 }, { "epoch": 6.9372862029646525, "grad_norm": 5.1875, "learning_rate": 1.1789308132105145e-06, "loss": 0.5711, "mean_token_accuracy": 0.8826945275068283, "num_tokens": 325665451.0, "step": 3045 }, { "epoch": 6.939566704675029, "grad_norm": 3.203125, "learning_rate": 1.1773321810370527e-06, "loss": 0.5702, "mean_token_accuracy": 0.8854436427354813, "num_tokens": 325772180.0, "step": 3046 }, { "epoch": 6.941847206385405, "grad_norm": 2.515625, "learning_rate": 1.1757342995843103e-06, "loss": 0.5885, "mean_token_accuracy": 0.8795358538627625, "num_tokens": 325879003.0, "step": 3047 }, { "epoch": 6.944127708095781, "grad_norm": 3.125, "learning_rate": 1.1741371697592134e-06, "loss": 0.5642, "mean_token_accuracy": 0.8838000446557999, "num_tokens": 325986026.0, "step": 3048 }, { "epoch": 6.946408209806157, "grad_norm": 3.625, "learning_rate": 1.1725407924682628e-06, "loss": 0.5718, "mean_token_accuracy": 0.8830106258392334, "num_tokens": 326093663.0, "step": 3049 }, { "epoch": 6.9486887115165334, "grad_norm": 4.8125, "learning_rate": 1.17094516861753e-06, "loss": 0.5393, "mean_token_accuracy": 0.8868276029825211, "num_tokens": 326201499.0, "step": 3050 }, { "epoch": 6.95096921322691, "grad_norm": 3.1875, "learning_rate": 1.1693502991126609e-06, "loss": 0.5703, "mean_token_accuracy": 0.8848345726728439, "num_tokens": 326308543.0, "step": 3051 }, { "epoch": 6.953249714937286, "grad_norm": 3.359375, "learning_rate": 1.1677561848588734e-06, "loss": 0.5626, "mean_token_accuracy": 0.8820251226425171, "num_tokens": 326415635.0, "step": 3052 }, { "epoch": 6.955530216647663, "grad_norm": 3.375, "learning_rate": 1.166162826760955e-06, "loss": 0.5705, "mean_token_accuracy": 0.8849399089813232, "num_tokens": 326523279.0, "step": 3053 }, { "epoch": 6.957810718358039, "grad_norm": 2.84375, "learning_rate": 1.1645702257232663e-06, "loss": 0.5785, "mean_token_accuracy": 0.8775086104869843, "num_tokens": 326630109.0, "step": 3054 }, { "epoch": 6.960091220068415, "grad_norm": 3.0, "learning_rate": 1.1629783826497351e-06, "loss": 0.5695, "mean_token_accuracy": 0.8805986046791077, "num_tokens": 326737084.0, "step": 3055 }, { "epoch": 6.9623717217787915, "grad_norm": 2.625, "learning_rate": 1.161387298443863e-06, "loss": 0.5631, "mean_token_accuracy": 0.8876520395278931, "num_tokens": 326844156.0, "step": 3056 }, { "epoch": 6.964652223489168, "grad_norm": 3.1875, "learning_rate": 1.1597969740087159e-06, "loss": 0.5757, "mean_token_accuracy": 0.8821894228458405, "num_tokens": 326951005.0, "step": 3057 }, { "epoch": 6.966932725199544, "grad_norm": 3.953125, "learning_rate": 1.1582074102469332e-06, "loss": 0.5554, "mean_token_accuracy": 0.8848588168621063, "num_tokens": 327058614.0, "step": 3058 }, { "epoch": 6.96921322690992, "grad_norm": 3.0, "learning_rate": 1.1566186080607198e-06, "loss": 0.5495, "mean_token_accuracy": 0.8877767473459244, "num_tokens": 327166015.0, "step": 3059 }, { "epoch": 6.971493728620296, "grad_norm": 3.96875, "learning_rate": 1.1550305683518506e-06, "loss": 0.5698, "mean_token_accuracy": 0.8813445121049881, "num_tokens": 327273077.0, "step": 3060 }, { "epoch": 6.9737742303306725, "grad_norm": 3.109375, "learning_rate": 1.1534432920216643e-06, "loss": 0.5984, "mean_token_accuracy": 0.8741457164287567, "num_tokens": 327379556.0, "step": 3061 }, { "epoch": 6.976054732041049, "grad_norm": 2.578125, "learning_rate": 1.151856779971069e-06, "loss": 0.5636, "mean_token_accuracy": 0.8825311958789825, "num_tokens": 327486300.0, "step": 3062 }, { "epoch": 6.978335233751425, "grad_norm": 2.8125, "learning_rate": 1.1502710331005384e-06, "loss": 0.5883, "mean_token_accuracy": 0.8804716914892197, "num_tokens": 327593244.0, "step": 3063 }, { "epoch": 6.980615735461802, "grad_norm": 2.96875, "learning_rate": 1.148686052310112e-06, "loss": 0.596, "mean_token_accuracy": 0.875918909907341, "num_tokens": 327699724.0, "step": 3064 }, { "epoch": 6.982896237172178, "grad_norm": 3.40625, "learning_rate": 1.147101838499395e-06, "loss": 0.583, "mean_token_accuracy": 0.8763432502746582, "num_tokens": 327806813.0, "step": 3065 }, { "epoch": 6.985176738882554, "grad_norm": 2.875, "learning_rate": 1.145518392567555e-06, "loss": 0.5514, "mean_token_accuracy": 0.8882738053798676, "num_tokens": 327913705.0, "step": 3066 }, { "epoch": 6.9874572405929305, "grad_norm": 3.09375, "learning_rate": 1.1439357154133263e-06, "loss": 0.5451, "mean_token_accuracy": 0.8874611854553223, "num_tokens": 328021868.0, "step": 3067 }, { "epoch": 6.989737742303307, "grad_norm": 2.828125, "learning_rate": 1.1423538079350053e-06, "loss": 0.5511, "mean_token_accuracy": 0.8833891451358795, "num_tokens": 328129123.0, "step": 3068 }, { "epoch": 6.992018244013683, "grad_norm": 3.03125, "learning_rate": 1.1407726710304525e-06, "loss": 0.6043, "mean_token_accuracy": 0.8766579329967499, "num_tokens": 328235771.0, "step": 3069 }, { "epoch": 6.994298745724059, "grad_norm": 4.90625, "learning_rate": 1.139192305597092e-06, "loss": 0.57, "mean_token_accuracy": 0.8802380710840225, "num_tokens": 328342486.0, "step": 3070 }, { "epoch": 6.996579247434435, "grad_norm": 3.625, "learning_rate": 1.1376127125319065e-06, "loss": 0.5908, "mean_token_accuracy": 0.8793238997459412, "num_tokens": 328449792.0, "step": 3071 }, { "epoch": 6.9988597491448115, "grad_norm": 3.296875, "learning_rate": 1.1360338927314432e-06, "loss": 0.5572, "mean_token_accuracy": 0.8853235244750977, "num_tokens": 328556382.0, "step": 3072 }, { "epoch": 7.0, "grad_norm": 5.625, "learning_rate": 1.1344558470918098e-06, "loss": 0.6197, "mean_token_accuracy": 0.8733022511005402, "num_tokens": 328595624.0, "step": 3073 }, { "epoch": 7.002280501710376, "grad_norm": 5.1875, "learning_rate": 1.1328785765086752e-06, "loss": 0.5623, "mean_token_accuracy": 0.8809218257665634, "num_tokens": 328702654.0, "step": 3074 }, { "epoch": 7.004561003420752, "grad_norm": 2.96875, "learning_rate": 1.131302081877268e-06, "loss": 0.5739, "mean_token_accuracy": 0.8839420080184937, "num_tokens": 328809480.0, "step": 3075 }, { "epoch": 7.006841505131129, "grad_norm": 3.0625, "learning_rate": 1.1297263640923745e-06, "loss": 0.5697, "mean_token_accuracy": 0.8816477954387665, "num_tokens": 328916070.0, "step": 3076 }, { "epoch": 7.009122006841505, "grad_norm": 2.71875, "learning_rate": 1.1281514240483427e-06, "loss": 0.5495, "mean_token_accuracy": 0.8879576325416565, "num_tokens": 329022615.0, "step": 3077 }, { "epoch": 7.011402508551882, "grad_norm": 3.875, "learning_rate": 1.1265772626390786e-06, "loss": 0.5521, "mean_token_accuracy": 0.888176754117012, "num_tokens": 329129873.0, "step": 3078 }, { "epoch": 7.013683010262258, "grad_norm": 3.0, "learning_rate": 1.1250038807580449e-06, "loss": 0.5592, "mean_token_accuracy": 0.8815352469682693, "num_tokens": 329236596.0, "step": 3079 }, { "epoch": 7.015963511972634, "grad_norm": 3.359375, "learning_rate": 1.1234312792982627e-06, "loss": 0.5557, "mean_token_accuracy": 0.8854427635669708, "num_tokens": 329344039.0, "step": 3080 }, { "epoch": 7.015963511972634, "eval_loss": 0.5865316987037659, "eval_mean_token_accuracy": 0.8799761234580791, "eval_num_tokens": 329344039.0, "eval_runtime": 58.6794, "eval_samples_per_second": 142.895, "eval_steps_per_second": 4.482, "step": 3080 }, { "epoch": 7.01824401368301, "grad_norm": 3.078125, "learning_rate": 1.1218594591523118e-06, "loss": 0.5583, "mean_token_accuracy": 0.8825812339782715, "num_tokens": 329451249.0, "step": 3081 }, { "epoch": 7.020524515393387, "grad_norm": 2.546875, "learning_rate": 1.120288421212325e-06, "loss": 0.5654, "mean_token_accuracy": 0.8850286900997162, "num_tokens": 329558258.0, "step": 3082 }, { "epoch": 7.022805017103763, "grad_norm": 2.859375, "learning_rate": 1.1187181663699935e-06, "loss": 0.5759, "mean_token_accuracy": 0.8803520053625107, "num_tokens": 329665648.0, "step": 3083 }, { "epoch": 7.025085518814139, "grad_norm": 3.453125, "learning_rate": 1.1171486955165645e-06, "loss": 0.5544, "mean_token_accuracy": 0.886691614985466, "num_tokens": 329772942.0, "step": 3084 }, { "epoch": 7.027366020524515, "grad_norm": 2.5625, "learning_rate": 1.115580009542839e-06, "loss": 0.5551, "mean_token_accuracy": 0.8843899071216583, "num_tokens": 329880278.0, "step": 3085 }, { "epoch": 7.029646522234891, "grad_norm": 2.875, "learning_rate": 1.1140121093391736e-06, "loss": 0.5499, "mean_token_accuracy": 0.8839717209339142, "num_tokens": 329987716.0, "step": 3086 }, { "epoch": 7.031927023945268, "grad_norm": 3.34375, "learning_rate": 1.1124449957954764e-06, "loss": 0.5737, "mean_token_accuracy": 0.8822407573461533, "num_tokens": 330094855.0, "step": 3087 }, { "epoch": 7.034207525655645, "grad_norm": 2.484375, "learning_rate": 1.110878669801212e-06, "loss": 0.56, "mean_token_accuracy": 0.8854365050792694, "num_tokens": 330202300.0, "step": 3088 }, { "epoch": 7.036488027366021, "grad_norm": 2.8125, "learning_rate": 1.1093131322453966e-06, "loss": 0.562, "mean_token_accuracy": 0.8828160762786865, "num_tokens": 330309184.0, "step": 3089 }, { "epoch": 7.038768529076397, "grad_norm": 2.96875, "learning_rate": 1.1077483840165986e-06, "loss": 0.5645, "mean_token_accuracy": 0.8839673697948456, "num_tokens": 330415860.0, "step": 3090 }, { "epoch": 7.041049030786773, "grad_norm": 2.796875, "learning_rate": 1.10618442600294e-06, "loss": 0.5521, "mean_token_accuracy": 0.8864448517560959, "num_tokens": 330522700.0, "step": 3091 }, { "epoch": 7.043329532497149, "grad_norm": 2.703125, "learning_rate": 1.1046212590920931e-06, "loss": 0.55, "mean_token_accuracy": 0.8861220180988312, "num_tokens": 330630445.0, "step": 3092 }, { "epoch": 7.045610034207526, "grad_norm": 3.25, "learning_rate": 1.10305888417128e-06, "loss": 0.5674, "mean_token_accuracy": 0.8770045042037964, "num_tokens": 330737176.0, "step": 3093 }, { "epoch": 7.047890535917902, "grad_norm": 2.765625, "learning_rate": 1.101497302127275e-06, "loss": 0.5751, "mean_token_accuracy": 0.880455732345581, "num_tokens": 330843523.0, "step": 3094 }, { "epoch": 7.050171037628278, "grad_norm": 2.703125, "learning_rate": 1.0999365138464024e-06, "loss": 0.5503, "mean_token_accuracy": 0.8857882469892502, "num_tokens": 330950912.0, "step": 3095 }, { "epoch": 7.052451539338654, "grad_norm": 3.546875, "learning_rate": 1.0983765202145351e-06, "loss": 0.5694, "mean_token_accuracy": 0.8827014863491058, "num_tokens": 331057394.0, "step": 3096 }, { "epoch": 7.05473204104903, "grad_norm": 3.734375, "learning_rate": 1.0968173221170966e-06, "loss": 0.5841, "mean_token_accuracy": 0.88177290558815, "num_tokens": 331164519.0, "step": 3097 }, { "epoch": 7.0570125427594075, "grad_norm": 2.578125, "learning_rate": 1.0952589204390557e-06, "loss": 0.5574, "mean_token_accuracy": 0.8831107765436172, "num_tokens": 331272101.0, "step": 3098 }, { "epoch": 7.059293044469784, "grad_norm": 3.46875, "learning_rate": 1.0937013160649328e-06, "loss": 0.5819, "mean_token_accuracy": 0.8797809779644012, "num_tokens": 331379251.0, "step": 3099 }, { "epoch": 7.06157354618016, "grad_norm": 3.40625, "learning_rate": 1.0921445098787923e-06, "loss": 0.5698, "mean_token_accuracy": 0.8834618926048279, "num_tokens": 331486795.0, "step": 3100 }, { "epoch": 7.063854047890536, "grad_norm": 3.375, "learning_rate": 1.0905885027642484e-06, "loss": 0.5577, "mean_token_accuracy": 0.882593035697937, "num_tokens": 331593690.0, "step": 3101 }, { "epoch": 7.066134549600912, "grad_norm": 4.21875, "learning_rate": 1.0890332956044614e-06, "loss": 0.5693, "mean_token_accuracy": 0.8840242773294449, "num_tokens": 331700378.0, "step": 3102 }, { "epoch": 7.068415051311288, "grad_norm": 2.921875, "learning_rate": 1.0874788892821354e-06, "loss": 0.5603, "mean_token_accuracy": 0.886967346072197, "num_tokens": 331807432.0, "step": 3103 }, { "epoch": 7.070695553021665, "grad_norm": 2.5625, "learning_rate": 1.0859252846795215e-06, "loss": 0.5571, "mean_token_accuracy": 0.8852237462997437, "num_tokens": 331914837.0, "step": 3104 }, { "epoch": 7.072976054732041, "grad_norm": 3.765625, "learning_rate": 1.0843724826784165e-06, "loss": 0.5743, "mean_token_accuracy": 0.8843671381473541, "num_tokens": 332021366.0, "step": 3105 }, { "epoch": 7.075256556442417, "grad_norm": 3.359375, "learning_rate": 1.0828204841601608e-06, "loss": 0.5772, "mean_token_accuracy": 0.8801092207431793, "num_tokens": 332128357.0, "step": 3106 }, { "epoch": 7.077537058152793, "grad_norm": 3.1875, "learning_rate": 1.0812692900056384e-06, "loss": 0.5754, "mean_token_accuracy": 0.8785477876663208, "num_tokens": 332235371.0, "step": 3107 }, { "epoch": 7.07981755986317, "grad_norm": 3.09375, "learning_rate": 1.0797189010952784e-06, "loss": 0.5819, "mean_token_accuracy": 0.8777376860380173, "num_tokens": 332342307.0, "step": 3108 }, { "epoch": 7.0820980615735465, "grad_norm": 4.125, "learning_rate": 1.0781693183090495e-06, "loss": 0.5818, "mean_token_accuracy": 0.8822515457868576, "num_tokens": 332449426.0, "step": 3109 }, { "epoch": 7.084378563283923, "grad_norm": 3.390625, "learning_rate": 1.076620542526466e-06, "loss": 0.5588, "mean_token_accuracy": 0.8819447606801987, "num_tokens": 332556668.0, "step": 3110 }, { "epoch": 7.086659064994299, "grad_norm": 3.484375, "learning_rate": 1.0750725746265832e-06, "loss": 0.5739, "mean_token_accuracy": 0.8780965954065323, "num_tokens": 332664662.0, "step": 3111 }, { "epoch": 7.088939566704675, "grad_norm": 2.578125, "learning_rate": 1.0735254154879979e-06, "loss": 0.5859, "mean_token_accuracy": 0.880235344171524, "num_tokens": 332771413.0, "step": 3112 }, { "epoch": 7.091220068415051, "grad_norm": 5.21875, "learning_rate": 1.0719790659888481e-06, "loss": 0.5952, "mean_token_accuracy": 0.8757160604000092, "num_tokens": 332878585.0, "step": 3113 }, { "epoch": 7.0935005701254275, "grad_norm": 3.078125, "learning_rate": 1.070433527006811e-06, "loss": 0.5831, "mean_token_accuracy": 0.8773680776357651, "num_tokens": 332985770.0, "step": 3114 }, { "epoch": 7.095781071835804, "grad_norm": 2.6875, "learning_rate": 1.0688887994191049e-06, "loss": 0.5717, "mean_token_accuracy": 0.8812025189399719, "num_tokens": 333093197.0, "step": 3115 }, { "epoch": 7.09806157354618, "grad_norm": 3.4375, "learning_rate": 1.0673448841024875e-06, "loss": 0.5629, "mean_token_accuracy": 0.8841598778963089, "num_tokens": 333200787.0, "step": 3116 }, { "epoch": 7.100342075256556, "grad_norm": 2.96875, "learning_rate": 1.0658017819332556e-06, "loss": 0.5759, "mean_token_accuracy": 0.8818695098161697, "num_tokens": 333308356.0, "step": 3117 }, { "epoch": 7.102622576966933, "grad_norm": 4.8125, "learning_rate": 1.064259493787244e-06, "loss": 0.5847, "mean_token_accuracy": 0.8795827180147171, "num_tokens": 333415586.0, "step": 3118 }, { "epoch": 7.104903078677309, "grad_norm": 3.109375, "learning_rate": 1.0627180205398263e-06, "loss": 0.5729, "mean_token_accuracy": 0.8810782432556152, "num_tokens": 333522537.0, "step": 3119 }, { "epoch": 7.1071835803876855, "grad_norm": 2.953125, "learning_rate": 1.0611773630659117e-06, "loss": 0.5742, "mean_token_accuracy": 0.8795880377292633, "num_tokens": 333629886.0, "step": 3120 }, { "epoch": 7.109464082098062, "grad_norm": 4.40625, "learning_rate": 1.0596375222399491e-06, "loss": 0.5703, "mean_token_accuracy": 0.8833544999361038, "num_tokens": 333737026.0, "step": 3121 }, { "epoch": 7.111744583808438, "grad_norm": 2.875, "learning_rate": 1.0580984989359205e-06, "loss": 0.5599, "mean_token_accuracy": 0.8837230205535889, "num_tokens": 333843902.0, "step": 3122 }, { "epoch": 7.114025085518814, "grad_norm": 2.84375, "learning_rate": 1.0565602940273472e-06, "loss": 0.5771, "mean_token_accuracy": 0.8842257410287857, "num_tokens": 333951056.0, "step": 3123 }, { "epoch": 7.11630558722919, "grad_norm": 3.609375, "learning_rate": 1.055022908387285e-06, "loss": 0.5895, "mean_token_accuracy": 0.8821539282798767, "num_tokens": 334058094.0, "step": 3124 }, { "epoch": 7.1185860889395665, "grad_norm": 4.21875, "learning_rate": 1.053486342888323e-06, "loss": 0.5581, "mean_token_accuracy": 0.8864398002624512, "num_tokens": 334165433.0, "step": 3125 }, { "epoch": 7.120866590649943, "grad_norm": 3.09375, "learning_rate": 1.0519505984025865e-06, "loss": 0.5968, "mean_token_accuracy": 0.8731893301010132, "num_tokens": 334272058.0, "step": 3126 }, { "epoch": 7.123147092360319, "grad_norm": 5.34375, "learning_rate": 1.050415675801735e-06, "loss": 0.5417, "mean_token_accuracy": 0.8889250755310059, "num_tokens": 334379380.0, "step": 3127 }, { "epoch": 7.125427594070696, "grad_norm": 2.84375, "learning_rate": 1.0488815759569605e-06, "loss": 0.5574, "mean_token_accuracy": 0.8852666765451431, "num_tokens": 334486391.0, "step": 3128 }, { "epoch": 7.127708095781072, "grad_norm": 4.09375, "learning_rate": 1.0473482997389891e-06, "loss": 0.577, "mean_token_accuracy": 0.8808193802833557, "num_tokens": 334593174.0, "step": 3129 }, { "epoch": 7.129988597491448, "grad_norm": 3.25, "learning_rate": 1.0458158480180777e-06, "loss": 0.575, "mean_token_accuracy": 0.8826655298471451, "num_tokens": 334700093.0, "step": 3130 }, { "epoch": 7.1322690992018245, "grad_norm": 3.140625, "learning_rate": 1.0442842216640168e-06, "loss": 0.5637, "mean_token_accuracy": 0.8832122981548309, "num_tokens": 334807273.0, "step": 3131 }, { "epoch": 7.134549600912201, "grad_norm": 2.90625, "learning_rate": 1.042753421546128e-06, "loss": 0.5607, "mean_token_accuracy": 0.883466050028801, "num_tokens": 334914093.0, "step": 3132 }, { "epoch": 7.136830102622577, "grad_norm": 2.921875, "learning_rate": 1.0412234485332636e-06, "loss": 0.5643, "mean_token_accuracy": 0.8840949237346649, "num_tokens": 335021072.0, "step": 3133 }, { "epoch": 7.139110604332953, "grad_norm": 3.046875, "learning_rate": 1.0396943034938077e-06, "loss": 0.5824, "mean_token_accuracy": 0.8802383244037628, "num_tokens": 335127732.0, "step": 3134 }, { "epoch": 7.141391106043329, "grad_norm": 3.671875, "learning_rate": 1.0381659872956732e-06, "loss": 0.5877, "mean_token_accuracy": 0.8789878785610199, "num_tokens": 335234354.0, "step": 3135 }, { "epoch": 7.1436716077537055, "grad_norm": 2.71875, "learning_rate": 1.0366385008063015e-06, "loss": 0.5801, "mean_token_accuracy": 0.8824616819620132, "num_tokens": 335341184.0, "step": 3136 }, { "epoch": 7.145952109464082, "grad_norm": 2.625, "learning_rate": 1.0351118448926658e-06, "loss": 0.5661, "mean_token_accuracy": 0.8839305341243744, "num_tokens": 335448536.0, "step": 3137 }, { "epoch": 7.148232611174459, "grad_norm": 3.0625, "learning_rate": 1.0335860204212662e-06, "loss": 0.5823, "mean_token_accuracy": 0.880255714058876, "num_tokens": 335555399.0, "step": 3138 }, { "epoch": 7.150513112884835, "grad_norm": 2.875, "learning_rate": 1.0320610282581309e-06, "loss": 0.5759, "mean_token_accuracy": 0.8826555460691452, "num_tokens": 335662374.0, "step": 3139 }, { "epoch": 7.152793614595211, "grad_norm": 2.640625, "learning_rate": 1.0305368692688175e-06, "loss": 0.5711, "mean_token_accuracy": 0.8849854469299316, "num_tokens": 335769670.0, "step": 3140 }, { "epoch": 7.155074116305587, "grad_norm": 3.53125, "learning_rate": 1.029013544318407e-06, "loss": 0.5811, "mean_token_accuracy": 0.8815399259328842, "num_tokens": 335876896.0, "step": 3141 }, { "epoch": 7.1573546180159635, "grad_norm": 3.5625, "learning_rate": 1.0274910542715103e-06, "loss": 0.5852, "mean_token_accuracy": 0.8821052312850952, "num_tokens": 335983657.0, "step": 3142 }, { "epoch": 7.15963511972634, "grad_norm": 4.625, "learning_rate": 1.025969399992264e-06, "loss": 0.5746, "mean_token_accuracy": 0.8829544335603714, "num_tokens": 336090948.0, "step": 3143 }, { "epoch": 7.161915621436716, "grad_norm": 4.40625, "learning_rate": 1.0244485823443281e-06, "loss": 0.5876, "mean_token_accuracy": 0.8804685175418854, "num_tokens": 336197903.0, "step": 3144 }, { "epoch": 7.164196123147092, "grad_norm": 2.421875, "learning_rate": 1.0229286021908913e-06, "loss": 0.5621, "mean_token_accuracy": 0.8843429386615753, "num_tokens": 336305488.0, "step": 3145 }, { "epoch": 7.166476624857468, "grad_norm": 3.15625, "learning_rate": 1.021409460394663e-06, "loss": 0.5326, "mean_token_accuracy": 0.8926713168621063, "num_tokens": 336413922.0, "step": 3146 }, { "epoch": 7.168757126567845, "grad_norm": 2.875, "learning_rate": 1.0198911578178797e-06, "loss": 0.5753, "mean_token_accuracy": 0.8805201500654221, "num_tokens": 336521353.0, "step": 3147 }, { "epoch": 7.1710376282782216, "grad_norm": 2.859375, "learning_rate": 1.0183736953223005e-06, "loss": 0.5719, "mean_token_accuracy": 0.8848622888326645, "num_tokens": 336628098.0, "step": 3148 }, { "epoch": 7.173318129988598, "grad_norm": 4.5, "learning_rate": 1.0168570737692082e-06, "loss": 0.5669, "mean_token_accuracy": 0.8816571831703186, "num_tokens": 336735354.0, "step": 3149 }, { "epoch": 7.175598631698974, "grad_norm": 2.859375, "learning_rate": 1.0153412940194073e-06, "loss": 0.5797, "mean_token_accuracy": 0.8790102005004883, "num_tokens": 336843105.0, "step": 3150 }, { "epoch": 7.17787913340935, "grad_norm": 4.09375, "learning_rate": 1.0138263569332268e-06, "loss": 0.5677, "mean_token_accuracy": 0.8861614763736725, "num_tokens": 336950109.0, "step": 3151 }, { "epoch": 7.180159635119726, "grad_norm": 3.703125, "learning_rate": 1.0123122633705131e-06, "loss": 0.5865, "mean_token_accuracy": 0.8783406764268875, "num_tokens": 337056741.0, "step": 3152 }, { "epoch": 7.1824401368301025, "grad_norm": 3.234375, "learning_rate": 1.0107990141906378e-06, "loss": 0.5768, "mean_token_accuracy": 0.8800586462020874, "num_tokens": 337163436.0, "step": 3153 }, { "epoch": 7.184720638540479, "grad_norm": 3.0, "learning_rate": 1.0092866102524922e-06, "loss": 0.5597, "mean_token_accuracy": 0.8837520331144333, "num_tokens": 337270521.0, "step": 3154 }, { "epoch": 7.187001140250855, "grad_norm": 4.34375, "learning_rate": 1.0077750524144871e-06, "loss": 0.56, "mean_token_accuracy": 0.887342780828476, "num_tokens": 337377963.0, "step": 3155 }, { "epoch": 7.189281641961231, "grad_norm": 2.90625, "learning_rate": 1.0062643415345546e-06, "loss": 0.5532, "mean_token_accuracy": 0.8867213129997253, "num_tokens": 337484753.0, "step": 3156 }, { "epoch": 7.191562143671608, "grad_norm": 3.21875, "learning_rate": 1.0047544784701435e-06, "loss": 0.5591, "mean_token_accuracy": 0.8835946470499039, "num_tokens": 337592359.0, "step": 3157 }, { "epoch": 7.193842645381984, "grad_norm": 3.28125, "learning_rate": 1.0032454640782232e-06, "loss": 0.5675, "mean_token_accuracy": 0.8831405937671661, "num_tokens": 337698920.0, "step": 3158 }, { "epoch": 7.196123147092361, "grad_norm": 2.84375, "learning_rate": 1.0017372992152819e-06, "loss": 0.5524, "mean_token_accuracy": 0.8846133053302765, "num_tokens": 337806568.0, "step": 3159 }, { "epoch": 7.198403648802737, "grad_norm": 2.734375, "learning_rate": 1.0002299847373243e-06, "loss": 0.5758, "mean_token_accuracy": 0.88174769282341, "num_tokens": 337913427.0, "step": 3160 }, { "epoch": 7.200684150513113, "grad_norm": 2.71875, "learning_rate": 9.987235214998741e-07, "loss": 0.5774, "mean_token_accuracy": 0.8783168196678162, "num_tokens": 338020074.0, "step": 3161 }, { "epoch": 7.202964652223489, "grad_norm": 3.234375, "learning_rate": 9.972179103579687e-07, "loss": 0.5932, "mean_token_accuracy": 0.8791584521532059, "num_tokens": 338126865.0, "step": 3162 }, { "epoch": 7.205245153933865, "grad_norm": 2.984375, "learning_rate": 9.957131521661655e-07, "loss": 0.561, "mean_token_accuracy": 0.8836409598588943, "num_tokens": 338234022.0, "step": 3163 }, { "epoch": 7.2075256556442415, "grad_norm": 2.6875, "learning_rate": 9.942092477785365e-07, "loss": 0.5694, "mean_token_accuracy": 0.8865296542644501, "num_tokens": 338341091.0, "step": 3164 }, { "epoch": 7.209806157354618, "grad_norm": 2.875, "learning_rate": 9.927061980486668e-07, "loss": 0.5629, "mean_token_accuracy": 0.883445993065834, "num_tokens": 338447802.0, "step": 3165 }, { "epoch": 7.212086659064994, "grad_norm": 3.359375, "learning_rate": 9.9120400382966e-07, "loss": 0.5991, "mean_token_accuracy": 0.8793905973434448, "num_tokens": 338554661.0, "step": 3166 }, { "epoch": 7.214367160775371, "grad_norm": 5.5625, "learning_rate": 9.897026659741328e-07, "loss": 0.5641, "mean_token_accuracy": 0.8824618011713028, "num_tokens": 338661638.0, "step": 3167 }, { "epoch": 7.216647662485747, "grad_norm": 6.0, "learning_rate": 9.882021853342143e-07, "loss": 0.5616, "mean_token_accuracy": 0.882818415760994, "num_tokens": 338768332.0, "step": 3168 }, { "epoch": 7.218928164196123, "grad_norm": 3.046875, "learning_rate": 9.867025627615493e-07, "loss": 0.5752, "mean_token_accuracy": 0.8801371902227402, "num_tokens": 338875605.0, "step": 3169 }, { "epoch": 7.2212086659065, "grad_norm": 3.046875, "learning_rate": 9.852037991072941e-07, "loss": 0.5702, "mean_token_accuracy": 0.8800628185272217, "num_tokens": 338982521.0, "step": 3170 }, { "epoch": 7.223489167616876, "grad_norm": 4.21875, "learning_rate": 9.837058952221182e-07, "loss": 0.5762, "mean_token_accuracy": 0.8799264430999756, "num_tokens": 339089755.0, "step": 3171 }, { "epoch": 7.225769669327252, "grad_norm": 4.59375, "learning_rate": 9.822088519562038e-07, "loss": 0.5651, "mean_token_accuracy": 0.8843246251344681, "num_tokens": 339196888.0, "step": 3172 }, { "epoch": 7.228050171037628, "grad_norm": 2.5625, "learning_rate": 9.80712670159242e-07, "loss": 0.5808, "mean_token_accuracy": 0.8821865916252136, "num_tokens": 339303828.0, "step": 3173 }, { "epoch": 7.230330672748004, "grad_norm": 4.3125, "learning_rate": 9.792173506804378e-07, "loss": 0.5741, "mean_token_accuracy": 0.880447655916214, "num_tokens": 339410331.0, "step": 3174 }, { "epoch": 7.2326111744583805, "grad_norm": 4.40625, "learning_rate": 9.777228943685055e-07, "loss": 0.5748, "mean_token_accuracy": 0.8847004473209381, "num_tokens": 339517573.0, "step": 3175 }, { "epoch": 7.234891676168757, "grad_norm": 2.859375, "learning_rate": 9.762293020716696e-07, "loss": 0.5744, "mean_token_accuracy": 0.8809026032686234, "num_tokens": 339624146.0, "step": 3176 }, { "epoch": 7.237172177879134, "grad_norm": 2.75, "learning_rate": 9.74736574637665e-07, "loss": 0.5662, "mean_token_accuracy": 0.8855401873588562, "num_tokens": 339731001.0, "step": 3177 }, { "epoch": 7.23945267958951, "grad_norm": 3.203125, "learning_rate": 9.732447129137337e-07, "loss": 0.5738, "mean_token_accuracy": 0.8829121142625809, "num_tokens": 339838319.0, "step": 3178 }, { "epoch": 7.241733181299886, "grad_norm": 3.484375, "learning_rate": 9.717537177466279e-07, "loss": 0.5855, "mean_token_accuracy": 0.8830237686634064, "num_tokens": 339945278.0, "step": 3179 }, { "epoch": 7.244013683010262, "grad_norm": 3.984375, "learning_rate": 9.702635899826082e-07, "loss": 0.5612, "mean_token_accuracy": 0.8862001597881317, "num_tokens": 340052303.0, "step": 3180 }, { "epoch": 7.246294184720639, "grad_norm": 3.734375, "learning_rate": 9.687743304674421e-07, "loss": 0.5634, "mean_token_accuracy": 0.8842123001813889, "num_tokens": 340158489.0, "step": 3181 }, { "epoch": 7.248574686431015, "grad_norm": 3.0625, "learning_rate": 9.672859400464046e-07, "loss": 0.5972, "mean_token_accuracy": 0.8775025904178619, "num_tokens": 340265222.0, "step": 3182 }, { "epoch": 7.250855188141391, "grad_norm": 2.921875, "learning_rate": 9.657984195642783e-07, "loss": 0.5997, "mean_token_accuracy": 0.8756692260503769, "num_tokens": 340371418.0, "step": 3183 }, { "epoch": 7.253135689851767, "grad_norm": 2.609375, "learning_rate": 9.64311769865349e-07, "loss": 0.5728, "mean_token_accuracy": 0.8807047456502914, "num_tokens": 340478369.0, "step": 3184 }, { "epoch": 7.255416191562143, "grad_norm": 4.03125, "learning_rate": 9.628259917934118e-07, "loss": 0.566, "mean_token_accuracy": 0.8846138417720795, "num_tokens": 340585428.0, "step": 3185 }, { "epoch": 7.2576966932725195, "grad_norm": 3.0625, "learning_rate": 9.613410861917661e-07, "loss": 0.5771, "mean_token_accuracy": 0.8808754086494446, "num_tokens": 340692420.0, "step": 3186 }, { "epoch": 7.259977194982897, "grad_norm": 3.65625, "learning_rate": 9.59857053903214e-07, "loss": 0.5659, "mean_token_accuracy": 0.8816579431295395, "num_tokens": 340799529.0, "step": 3187 }, { "epoch": 7.262257696693273, "grad_norm": 3.609375, "learning_rate": 9.583738957700653e-07, "loss": 0.5858, "mean_token_accuracy": 0.8804793953895569, "num_tokens": 340906789.0, "step": 3188 }, { "epoch": 7.264538198403649, "grad_norm": 2.640625, "learning_rate": 9.568916126341305e-07, "loss": 0.5822, "mean_token_accuracy": 0.8775817602872849, "num_tokens": 341013458.0, "step": 3189 }, { "epoch": 7.266818700114025, "grad_norm": 3.296875, "learning_rate": 9.554102053367253e-07, "loss": 0.5864, "mean_token_accuracy": 0.8787823021411896, "num_tokens": 341120727.0, "step": 3190 }, { "epoch": 7.269099201824401, "grad_norm": 4.6875, "learning_rate": 9.53929674718668e-07, "loss": 0.5834, "mean_token_accuracy": 0.8785655200481415, "num_tokens": 341227639.0, "step": 3191 }, { "epoch": 7.271379703534778, "grad_norm": 3.1875, "learning_rate": 9.524500216202795e-07, "loss": 0.5885, "mean_token_accuracy": 0.8779048174619675, "num_tokens": 341334736.0, "step": 3192 }, { "epoch": 7.273660205245154, "grad_norm": 3.671875, "learning_rate": 9.50971246881382e-07, "loss": 0.5946, "mean_token_accuracy": 0.8770471662282944, "num_tokens": 341441533.0, "step": 3193 }, { "epoch": 7.27594070695553, "grad_norm": 3.890625, "learning_rate": 9.494933513413007e-07, "loss": 0.5982, "mean_token_accuracy": 0.8785821348428726, "num_tokens": 341547940.0, "step": 3194 }, { "epoch": 7.278221208665906, "grad_norm": 2.890625, "learning_rate": 9.480163358388584e-07, "loss": 0.5544, "mean_token_accuracy": 0.8853187263011932, "num_tokens": 341655509.0, "step": 3195 }, { "epoch": 7.280501710376283, "grad_norm": 2.921875, "learning_rate": 9.465402012123818e-07, "loss": 0.5664, "mean_token_accuracy": 0.8840262442827225, "num_tokens": 341762697.0, "step": 3196 }, { "epoch": 7.282782212086659, "grad_norm": 3.296875, "learning_rate": 9.45064948299696e-07, "loss": 0.5929, "mean_token_accuracy": 0.8797417432069778, "num_tokens": 341869635.0, "step": 3197 }, { "epoch": 7.285062713797036, "grad_norm": 2.921875, "learning_rate": 9.435905779381265e-07, "loss": 0.6004, "mean_token_accuracy": 0.8786374479532242, "num_tokens": 341975850.0, "step": 3198 }, { "epoch": 7.287343215507412, "grad_norm": 3.3125, "learning_rate": 9.421170909644983e-07, "loss": 0.5555, "mean_token_accuracy": 0.8853399753570557, "num_tokens": 342083686.0, "step": 3199 }, { "epoch": 7.289623717217788, "grad_norm": 3.375, "learning_rate": 9.406444882151322e-07, "loss": 0.5771, "mean_token_accuracy": 0.8812949508428574, "num_tokens": 342191163.0, "step": 3200 }, { "epoch": 7.291904218928164, "grad_norm": 3.09375, "learning_rate": 9.391727705258502e-07, "loss": 0.5411, "mean_token_accuracy": 0.8899759203195572, "num_tokens": 342298933.0, "step": 3201 }, { "epoch": 7.29418472063854, "grad_norm": 5.71875, "learning_rate": 9.377019387319705e-07, "loss": 0.5756, "mean_token_accuracy": 0.8839350640773773, "num_tokens": 342405767.0, "step": 3202 }, { "epoch": 7.296465222348917, "grad_norm": 3.0, "learning_rate": 9.362319936683092e-07, "loss": 0.5962, "mean_token_accuracy": 0.8760559111833572, "num_tokens": 342512638.0, "step": 3203 }, { "epoch": 7.298745724059293, "grad_norm": 3.078125, "learning_rate": 9.347629361691795e-07, "loss": 0.5643, "mean_token_accuracy": 0.8815828859806061, "num_tokens": 342619888.0, "step": 3204 }, { "epoch": 7.301026225769669, "grad_norm": 3.953125, "learning_rate": 9.332947670683882e-07, "loss": 0.5465, "mean_token_accuracy": 0.8862589299678802, "num_tokens": 342726859.0, "step": 3205 }, { "epoch": 7.303306727480045, "grad_norm": 2.484375, "learning_rate": 9.318274871992408e-07, "loss": 0.5435, "mean_token_accuracy": 0.8866942375898361, "num_tokens": 342834871.0, "step": 3206 }, { "epoch": 7.305587229190422, "grad_norm": 2.953125, "learning_rate": 9.303610973945376e-07, "loss": 0.5552, "mean_token_accuracy": 0.8848265260457993, "num_tokens": 342942485.0, "step": 3207 }, { "epoch": 7.307867730900798, "grad_norm": 3.28125, "learning_rate": 9.288955984865717e-07, "loss": 0.5443, "mean_token_accuracy": 0.8879365026950836, "num_tokens": 343049386.0, "step": 3208 }, { "epoch": 7.310148232611175, "grad_norm": 4.1875, "learning_rate": 9.274309913071328e-07, "loss": 0.5866, "mean_token_accuracy": 0.8782593309879303, "num_tokens": 343155935.0, "step": 3209 }, { "epoch": 7.312428734321551, "grad_norm": 3.640625, "learning_rate": 9.259672766875044e-07, "loss": 0.5835, "mean_token_accuracy": 0.8784520477056503, "num_tokens": 343262869.0, "step": 3210 }, { "epoch": 7.314709236031927, "grad_norm": 2.640625, "learning_rate": 9.245044554584609e-07, "loss": 0.5644, "mean_token_accuracy": 0.884000301361084, "num_tokens": 343369889.0, "step": 3211 }, { "epoch": 7.316989737742303, "grad_norm": 3.640625, "learning_rate": 9.230425284502725e-07, "loss": 0.5673, "mean_token_accuracy": 0.8830729424953461, "num_tokens": 343476997.0, "step": 3212 }, { "epoch": 7.319270239452679, "grad_norm": 3.8125, "learning_rate": 9.215814964927005e-07, "loss": 0.5622, "mean_token_accuracy": 0.8840525597333908, "num_tokens": 343584267.0, "step": 3213 }, { "epoch": 7.321550741163056, "grad_norm": 3.203125, "learning_rate": 9.201213604149989e-07, "loss": 0.5861, "mean_token_accuracy": 0.8782940655946732, "num_tokens": 343690596.0, "step": 3214 }, { "epoch": 7.323831242873432, "grad_norm": 3.03125, "learning_rate": 9.186621210459129e-07, "loss": 0.6009, "mean_token_accuracy": 0.877897322177887, "num_tokens": 343797695.0, "step": 3215 }, { "epoch": 7.326111744583809, "grad_norm": 2.46875, "learning_rate": 9.172037792136773e-07, "loss": 0.5559, "mean_token_accuracy": 0.8841624855995178, "num_tokens": 343905377.0, "step": 3216 }, { "epoch": 7.328392246294185, "grad_norm": 3.21875, "learning_rate": 9.157463357460194e-07, "loss": 0.5597, "mean_token_accuracy": 0.8875628858804703, "num_tokens": 344012519.0, "step": 3217 }, { "epoch": 7.330672748004561, "grad_norm": 3.421875, "learning_rate": 9.142897914701565e-07, "loss": 0.5795, "mean_token_accuracy": 0.8802744299173355, "num_tokens": 344120062.0, "step": 3218 }, { "epoch": 7.3329532497149374, "grad_norm": 2.59375, "learning_rate": 9.128341472127944e-07, "loss": 0.562, "mean_token_accuracy": 0.8852846622467041, "num_tokens": 344227315.0, "step": 3219 }, { "epoch": 7.335233751425314, "grad_norm": 2.5, "learning_rate": 9.113794038001298e-07, "loss": 0.5734, "mean_token_accuracy": 0.8799241334199905, "num_tokens": 344334679.0, "step": 3220 }, { "epoch": 7.33751425313569, "grad_norm": 3.1875, "learning_rate": 9.099255620578451e-07, "loss": 0.5621, "mean_token_accuracy": 0.8857553601264954, "num_tokens": 344441934.0, "step": 3221 }, { "epoch": 7.339794754846066, "grad_norm": 2.796875, "learning_rate": 9.084726228111141e-07, "loss": 0.5754, "mean_token_accuracy": 0.8808804005384445, "num_tokens": 344548474.0, "step": 3222 }, { "epoch": 7.342075256556442, "grad_norm": 4.1875, "learning_rate": 9.070205868845966e-07, "loss": 0.5668, "mean_token_accuracy": 0.8860443085432053, "num_tokens": 344655190.0, "step": 3223 }, { "epoch": 7.344355758266818, "grad_norm": 5.09375, "learning_rate": 9.055694551024402e-07, "loss": 0.5654, "mean_token_accuracy": 0.8791934102773666, "num_tokens": 344762745.0, "step": 3224 }, { "epoch": 7.346636259977195, "grad_norm": 2.625, "learning_rate": 9.041192282882796e-07, "loss": 0.5971, "mean_token_accuracy": 0.8765173703432083, "num_tokens": 344869812.0, "step": 3225 }, { "epoch": 7.348916761687571, "grad_norm": 2.890625, "learning_rate": 9.026699072652361e-07, "loss": 0.5774, "mean_token_accuracy": 0.8783349841833115, "num_tokens": 344976836.0, "step": 3226 }, { "epoch": 7.351197263397948, "grad_norm": 3.5625, "learning_rate": 9.012214928559149e-07, "loss": 0.551, "mean_token_accuracy": 0.8875472396612167, "num_tokens": 345084950.0, "step": 3227 }, { "epoch": 7.353477765108324, "grad_norm": 6.34375, "learning_rate": 8.997739858824083e-07, "loss": 0.583, "mean_token_accuracy": 0.8791827261447906, "num_tokens": 345191976.0, "step": 3228 }, { "epoch": 7.3557582668187, "grad_norm": 2.8125, "learning_rate": 8.983273871662951e-07, "loss": 0.587, "mean_token_accuracy": 0.8798834979534149, "num_tokens": 345299539.0, "step": 3229 }, { "epoch": 7.3580387685290765, "grad_norm": 2.71875, "learning_rate": 8.968816975286346e-07, "loss": 0.5847, "mean_token_accuracy": 0.8791932165622711, "num_tokens": 345406386.0, "step": 3230 }, { "epoch": 7.360319270239453, "grad_norm": 3.640625, "learning_rate": 8.954369177899727e-07, "loss": 0.5763, "mean_token_accuracy": 0.8776541501283646, "num_tokens": 345513832.0, "step": 3231 }, { "epoch": 7.362599771949829, "grad_norm": 3.6875, "learning_rate": 8.939930487703402e-07, "loss": 0.5616, "mean_token_accuracy": 0.8819593489170074, "num_tokens": 345621238.0, "step": 3232 }, { "epoch": 7.364880273660205, "grad_norm": 4.75, "learning_rate": 8.925500912892471e-07, "loss": 0.5788, "mean_token_accuracy": 0.878967210650444, "num_tokens": 345727718.0, "step": 3233 }, { "epoch": 7.367160775370581, "grad_norm": 2.84375, "learning_rate": 8.911080461656893e-07, "loss": 0.5771, "mean_token_accuracy": 0.885176420211792, "num_tokens": 345834217.0, "step": 3234 }, { "epoch": 7.369441277080957, "grad_norm": 3.4375, "learning_rate": 8.896669142181436e-07, "loss": 0.5766, "mean_token_accuracy": 0.8792836219072342, "num_tokens": 345941843.0, "step": 3235 }, { "epoch": 7.3717217787913345, "grad_norm": 3.921875, "learning_rate": 8.882266962645695e-07, "loss": 0.5824, "mean_token_accuracy": 0.8821652680635452, "num_tokens": 346049168.0, "step": 3236 }, { "epoch": 7.374002280501711, "grad_norm": 3.34375, "learning_rate": 8.867873931224053e-07, "loss": 0.5566, "mean_token_accuracy": 0.8830807954072952, "num_tokens": 346156521.0, "step": 3237 }, { "epoch": 7.376282782212087, "grad_norm": 3.46875, "learning_rate": 8.853490056085723e-07, "loss": 0.5724, "mean_token_accuracy": 0.8781331777572632, "num_tokens": 346263445.0, "step": 3238 }, { "epoch": 7.378563283922463, "grad_norm": 2.625, "learning_rate": 8.839115345394716e-07, "loss": 0.5608, "mean_token_accuracy": 0.8834449648857117, "num_tokens": 346370432.0, "step": 3239 }, { "epoch": 7.380843785632839, "grad_norm": 3.0625, "learning_rate": 8.824749807309846e-07, "loss": 0.5643, "mean_token_accuracy": 0.8865119814872742, "num_tokens": 346477909.0, "step": 3240 }, { "epoch": 7.3831242873432155, "grad_norm": 3.578125, "learning_rate": 8.810393449984706e-07, "loss": 0.585, "mean_token_accuracy": 0.8794442266225815, "num_tokens": 346584653.0, "step": 3241 }, { "epoch": 7.385404789053592, "grad_norm": 2.890625, "learning_rate": 8.7960462815677e-07, "loss": 0.5767, "mean_token_accuracy": 0.8795085549354553, "num_tokens": 346691178.0, "step": 3242 }, { "epoch": 7.387685290763968, "grad_norm": 3.921875, "learning_rate": 8.781708310201989e-07, "loss": 0.5395, "mean_token_accuracy": 0.8901244848966599, "num_tokens": 346798766.0, "step": 3243 }, { "epoch": 7.389965792474344, "grad_norm": 3.75, "learning_rate": 8.767379544025531e-07, "loss": 0.5831, "mean_token_accuracy": 0.8783616870641708, "num_tokens": 346905556.0, "step": 3244 }, { "epoch": 7.39224629418472, "grad_norm": 3.078125, "learning_rate": 8.753059991171065e-07, "loss": 0.6058, "mean_token_accuracy": 0.8745017796754837, "num_tokens": 347012108.0, "step": 3245 }, { "epoch": 7.394526795895097, "grad_norm": 3.078125, "learning_rate": 8.738749659766085e-07, "loss": 0.5722, "mean_token_accuracy": 0.8834935277700424, "num_tokens": 347119108.0, "step": 3246 }, { "epoch": 7.3968072976054735, "grad_norm": 2.515625, "learning_rate": 8.724448557932874e-07, "loss": 0.5824, "mean_token_accuracy": 0.8808663189411163, "num_tokens": 347226523.0, "step": 3247 }, { "epoch": 7.39908779931585, "grad_norm": 3.9375, "learning_rate": 8.71015669378844e-07, "loss": 0.5613, "mean_token_accuracy": 0.883145734667778, "num_tokens": 347333636.0, "step": 3248 }, { "epoch": 7.401368301026226, "grad_norm": 3.046875, "learning_rate": 8.69587407544458e-07, "loss": 0.5815, "mean_token_accuracy": 0.8786642998456955, "num_tokens": 347440255.0, "step": 3249 }, { "epoch": 7.403648802736602, "grad_norm": 3.3125, "learning_rate": 8.681600711007832e-07, "loss": 0.5679, "mean_token_accuracy": 0.8843037188053131, "num_tokens": 347546867.0, "step": 3250 }, { "epoch": 7.405929304446978, "grad_norm": 2.75, "learning_rate": 8.667336608579488e-07, "loss": 0.5645, "mean_token_accuracy": 0.8831322491168976, "num_tokens": 347653956.0, "step": 3251 }, { "epoch": 7.4082098061573545, "grad_norm": 2.546875, "learning_rate": 8.653081776255562e-07, "loss": 0.557, "mean_token_accuracy": 0.8858849257230759, "num_tokens": 347761532.0, "step": 3252 }, { "epoch": 7.410490307867731, "grad_norm": 3.796875, "learning_rate": 8.638836222126839e-07, "loss": 0.5495, "mean_token_accuracy": 0.8869747668504715, "num_tokens": 347869055.0, "step": 3253 }, { "epoch": 7.412770809578107, "grad_norm": 5.0625, "learning_rate": 8.624599954278803e-07, "loss": 0.5862, "mean_token_accuracy": 0.8779660612344742, "num_tokens": 347975665.0, "step": 3254 }, { "epoch": 7.415051311288483, "grad_norm": 4.0625, "learning_rate": 8.610372980791695e-07, "loss": 0.5559, "mean_token_accuracy": 0.8830466419458389, "num_tokens": 348082812.0, "step": 3255 }, { "epoch": 7.41733181299886, "grad_norm": 6.40625, "learning_rate": 8.59615530974047e-07, "loss": 0.5588, "mean_token_accuracy": 0.8859402984380722, "num_tokens": 348190346.0, "step": 3256 }, { "epoch": 7.419612314709236, "grad_norm": 2.515625, "learning_rate": 8.581946949194802e-07, "loss": 0.5904, "mean_token_accuracy": 0.881903350353241, "num_tokens": 348297177.0, "step": 3257 }, { "epoch": 7.4218928164196125, "grad_norm": 2.59375, "learning_rate": 8.56774790721909e-07, "loss": 0.5616, "mean_token_accuracy": 0.8854805678129196, "num_tokens": 348404580.0, "step": 3258 }, { "epoch": 7.424173318129989, "grad_norm": 4.03125, "learning_rate": 8.553558191872422e-07, "loss": 0.5739, "mean_token_accuracy": 0.8778659850358963, "num_tokens": 348511494.0, "step": 3259 }, { "epoch": 7.426453819840365, "grad_norm": 3.765625, "learning_rate": 8.539377811208613e-07, "loss": 0.5819, "mean_token_accuracy": 0.8794593662023544, "num_tokens": 348618636.0, "step": 3260 }, { "epoch": 7.428734321550741, "grad_norm": 4.15625, "learning_rate": 8.525206773276173e-07, "loss": 0.5727, "mean_token_accuracy": 0.883561447262764, "num_tokens": 348725346.0, "step": 3261 }, { "epoch": 7.431014823261117, "grad_norm": 2.96875, "learning_rate": 8.511045086118311e-07, "loss": 0.5403, "mean_token_accuracy": 0.8908537030220032, "num_tokens": 348832898.0, "step": 3262 }, { "epoch": 7.4332953249714935, "grad_norm": 2.875, "learning_rate": 8.496892757772934e-07, "loss": 0.5582, "mean_token_accuracy": 0.8817539364099503, "num_tokens": 348940485.0, "step": 3263 }, { "epoch": 7.43557582668187, "grad_norm": 2.703125, "learning_rate": 8.482749796272613e-07, "loss": 0.586, "mean_token_accuracy": 0.8792066723108292, "num_tokens": 349047156.0, "step": 3264 }, { "epoch": 7.437856328392247, "grad_norm": 2.609375, "learning_rate": 8.468616209644634e-07, "loss": 0.5426, "mean_token_accuracy": 0.8881279230117798, "num_tokens": 349153893.0, "step": 3265 }, { "epoch": 7.440136830102623, "grad_norm": 2.78125, "learning_rate": 8.454492005910942e-07, "loss": 0.5668, "mean_token_accuracy": 0.8840852081775665, "num_tokens": 349260811.0, "step": 3266 }, { "epoch": 7.442417331812999, "grad_norm": 2.921875, "learning_rate": 8.440377193088162e-07, "loss": 0.5661, "mean_token_accuracy": 0.8842138350009918, "num_tokens": 349368153.0, "step": 3267 }, { "epoch": 7.444697833523375, "grad_norm": 3.921875, "learning_rate": 8.426271779187592e-07, "loss": 0.5885, "mean_token_accuracy": 0.8814673125743866, "num_tokens": 349475704.0, "step": 3268 }, { "epoch": 7.4469783352337515, "grad_norm": 2.703125, "learning_rate": 8.4121757722152e-07, "loss": 0.554, "mean_token_accuracy": 0.8854426443576813, "num_tokens": 349582741.0, "step": 3269 }, { "epoch": 7.449258836944128, "grad_norm": 2.90625, "learning_rate": 8.398089180171592e-07, "loss": 0.5512, "mean_token_accuracy": 0.8872993588447571, "num_tokens": 349689828.0, "step": 3270 }, { "epoch": 7.451539338654504, "grad_norm": 2.640625, "learning_rate": 8.384012011052053e-07, "loss": 0.541, "mean_token_accuracy": 0.8934241831302643, "num_tokens": 349797473.0, "step": 3271 }, { "epoch": 7.45381984036488, "grad_norm": 4.6875, "learning_rate": 8.369944272846522e-07, "loss": 0.5388, "mean_token_accuracy": 0.8873519897460938, "num_tokens": 349904634.0, "step": 3272 }, { "epoch": 7.456100342075256, "grad_norm": 3.109375, "learning_rate": 8.355885973539557e-07, "loss": 0.5718, "mean_token_accuracy": 0.8826733976602554, "num_tokens": 350011607.0, "step": 3273 }, { "epoch": 7.4583808437856325, "grad_norm": 3.0, "learning_rate": 8.341837121110386e-07, "loss": 0.5622, "mean_token_accuracy": 0.8846445381641388, "num_tokens": 350119034.0, "step": 3274 }, { "epoch": 7.460661345496009, "grad_norm": 2.5625, "learning_rate": 8.327797723532874e-07, "loss": 0.5438, "mean_token_accuracy": 0.8912030011415482, "num_tokens": 350226528.0, "step": 3275 }, { "epoch": 7.462941847206386, "grad_norm": 2.75, "learning_rate": 8.313767788775498e-07, "loss": 0.596, "mean_token_accuracy": 0.8776167631149292, "num_tokens": 350333295.0, "step": 3276 }, { "epoch": 7.465222348916762, "grad_norm": 2.609375, "learning_rate": 8.299747324801385e-07, "loss": 0.5985, "mean_token_accuracy": 0.8792443871498108, "num_tokens": 350440043.0, "step": 3277 }, { "epoch": 7.467502850627138, "grad_norm": 3.078125, "learning_rate": 8.285736339568279e-07, "loss": 0.5906, "mean_token_accuracy": 0.8789392858743668, "num_tokens": 350546875.0, "step": 3278 }, { "epoch": 7.469783352337514, "grad_norm": 2.765625, "learning_rate": 8.271734841028553e-07, "loss": 0.5682, "mean_token_accuracy": 0.8837863206863403, "num_tokens": 350653887.0, "step": 3279 }, { "epoch": 7.4720638540478905, "grad_norm": 2.625, "learning_rate": 8.25774283712917e-07, "loss": 0.5674, "mean_token_accuracy": 0.8839046061038971, "num_tokens": 350761695.0, "step": 3280 }, { "epoch": 7.474344355758267, "grad_norm": 3.515625, "learning_rate": 8.243760335811734e-07, "loss": 0.5873, "mean_token_accuracy": 0.8785386085510254, "num_tokens": 350868270.0, "step": 3281 }, { "epoch": 7.476624857468643, "grad_norm": 2.90625, "learning_rate": 8.229787345012439e-07, "loss": 0.5577, "mean_token_accuracy": 0.8850255161523819, "num_tokens": 350975311.0, "step": 3282 }, { "epoch": 7.478905359179019, "grad_norm": 3.28125, "learning_rate": 8.215823872662084e-07, "loss": 0.5602, "mean_token_accuracy": 0.8832758516073227, "num_tokens": 351082788.0, "step": 3283 }, { "epoch": 7.481185860889395, "grad_norm": 2.921875, "learning_rate": 8.201869926686068e-07, "loss": 0.5753, "mean_token_accuracy": 0.8810236304998398, "num_tokens": 351190001.0, "step": 3284 }, { "epoch": 7.483466362599772, "grad_norm": 4.28125, "learning_rate": 8.187925515004391e-07, "loss": 0.5749, "mean_token_accuracy": 0.8823070824146271, "num_tokens": 351296897.0, "step": 3285 }, { "epoch": 7.485746864310149, "grad_norm": 4.3125, "learning_rate": 8.173990645531612e-07, "loss": 0.5734, "mean_token_accuracy": 0.8813482820987701, "num_tokens": 351404175.0, "step": 3286 }, { "epoch": 7.488027366020525, "grad_norm": 2.9375, "learning_rate": 8.160065326176905e-07, "loss": 0.5748, "mean_token_accuracy": 0.8828651309013367, "num_tokens": 351510970.0, "step": 3287 }, { "epoch": 7.490307867730901, "grad_norm": 3.234375, "learning_rate": 8.14614956484401e-07, "loss": 0.5841, "mean_token_accuracy": 0.8815842270851135, "num_tokens": 351617878.0, "step": 3288 }, { "epoch": 7.492588369441277, "grad_norm": 3.5625, "learning_rate": 8.132243369431248e-07, "loss": 0.5592, "mean_token_accuracy": 0.8866011053323746, "num_tokens": 351725881.0, "step": 3289 }, { "epoch": 7.494868871151653, "grad_norm": 3.265625, "learning_rate": 8.11834674783151e-07, "loss": 0.5889, "mean_token_accuracy": 0.8792490214109421, "num_tokens": 351832802.0, "step": 3290 }, { "epoch": 7.4971493728620295, "grad_norm": 2.75, "learning_rate": 8.104459707932238e-07, "loss": 0.5753, "mean_token_accuracy": 0.8836107850074768, "num_tokens": 351939776.0, "step": 3291 }, { "epoch": 7.499429874572406, "grad_norm": 3.296875, "learning_rate": 8.090582257615456e-07, "loss": 0.5699, "mean_token_accuracy": 0.8809670358896255, "num_tokens": 352046740.0, "step": 3292 }, { "epoch": 7.501710376282782, "grad_norm": 2.546875, "learning_rate": 8.076714404757735e-07, "loss": 0.5593, "mean_token_accuracy": 0.8861764520406723, "num_tokens": 352153749.0, "step": 3293 }, { "epoch": 7.503990877993158, "grad_norm": 3.03125, "learning_rate": 8.062856157230209e-07, "loss": 0.5891, "mean_token_accuracy": 0.8781421184539795, "num_tokens": 352260556.0, "step": 3294 }, { "epoch": 7.506271379703534, "grad_norm": 3.921875, "learning_rate": 8.049007522898536e-07, "loss": 0.5713, "mean_token_accuracy": 0.8831081688404083, "num_tokens": 352367485.0, "step": 3295 }, { "epoch": 7.508551881413911, "grad_norm": 4.46875, "learning_rate": 8.035168509622948e-07, "loss": 0.5845, "mean_token_accuracy": 0.8832761347293854, "num_tokens": 352473992.0, "step": 3296 }, { "epoch": 7.510832383124288, "grad_norm": 3.375, "learning_rate": 8.02133912525819e-07, "loss": 0.5534, "mean_token_accuracy": 0.8868482112884521, "num_tokens": 352582216.0, "step": 3297 }, { "epoch": 7.513112884834664, "grad_norm": 3.15625, "learning_rate": 8.007519377653558e-07, "loss": 0.5803, "mean_token_accuracy": 0.8818371295928955, "num_tokens": 352688608.0, "step": 3298 }, { "epoch": 7.51539338654504, "grad_norm": 2.640625, "learning_rate": 7.993709274652872e-07, "loss": 0.564, "mean_token_accuracy": 0.8833436369895935, "num_tokens": 352794995.0, "step": 3299 }, { "epoch": 7.517673888255416, "grad_norm": 2.625, "learning_rate": 7.979908824094484e-07, "loss": 0.5448, "mean_token_accuracy": 0.8875368237495422, "num_tokens": 352902959.0, "step": 3300 }, { "epoch": 7.517673888255416, "eval_loss": 0.5862932801246643, "eval_mean_token_accuracy": 0.8799256974753318, "eval_num_tokens": 352902959.0, "eval_runtime": 58.6491, "eval_samples_per_second": 142.969, "eval_steps_per_second": 4.484, "step": 3300 }, { "epoch": 7.519954389965792, "grad_norm": 3.734375, "learning_rate": 7.966118033811271e-07, "loss": 0.5748, "mean_token_accuracy": 0.8830474466085434, "num_tokens": 353010544.0, "step": 3301 }, { "epoch": 7.5222348916761685, "grad_norm": 3.21875, "learning_rate": 7.952336911630604e-07, "loss": 0.5655, "mean_token_accuracy": 0.8814462274312973, "num_tokens": 353117940.0, "step": 3302 }, { "epoch": 7.524515393386545, "grad_norm": 3.359375, "learning_rate": 7.938565465374384e-07, "loss": 0.5743, "mean_token_accuracy": 0.8806511014699936, "num_tokens": 353225388.0, "step": 3303 }, { "epoch": 7.526795895096921, "grad_norm": 5.21875, "learning_rate": 7.924803702859024e-07, "loss": 0.5648, "mean_token_accuracy": 0.8838271647691727, "num_tokens": 353332497.0, "step": 3304 }, { "epoch": 7.529076396807298, "grad_norm": 2.734375, "learning_rate": 7.911051631895433e-07, "loss": 0.5663, "mean_token_accuracy": 0.8843842297792435, "num_tokens": 353439584.0, "step": 3305 }, { "epoch": 7.531356898517674, "grad_norm": 3.296875, "learning_rate": 7.897309260289027e-07, "loss": 0.5628, "mean_token_accuracy": 0.882561057806015, "num_tokens": 353546370.0, "step": 3306 }, { "epoch": 7.53363740022805, "grad_norm": 3.296875, "learning_rate": 7.883576595839698e-07, "loss": 0.5814, "mean_token_accuracy": 0.879010796546936, "num_tokens": 353653493.0, "step": 3307 }, { "epoch": 7.535917901938427, "grad_norm": 3.09375, "learning_rate": 7.869853646341849e-07, "loss": 0.581, "mean_token_accuracy": 0.8826321810483932, "num_tokens": 353759627.0, "step": 3308 }, { "epoch": 7.538198403648803, "grad_norm": 3.796875, "learning_rate": 7.856140419584357e-07, "loss": 0.5969, "mean_token_accuracy": 0.8757929354906082, "num_tokens": 353866548.0, "step": 3309 }, { "epoch": 7.540478905359179, "grad_norm": 2.828125, "learning_rate": 7.842436923350591e-07, "loss": 0.5741, "mean_token_accuracy": 0.8822405785322189, "num_tokens": 353973896.0, "step": 3310 }, { "epoch": 7.542759407069555, "grad_norm": 4.25, "learning_rate": 7.828743165418393e-07, "loss": 0.5866, "mean_token_accuracy": 0.8774521350860596, "num_tokens": 354080599.0, "step": 3311 }, { "epoch": 7.545039908779931, "grad_norm": 2.90625, "learning_rate": 7.815059153560065e-07, "loss": 0.5634, "mean_token_accuracy": 0.8832540661096573, "num_tokens": 354188075.0, "step": 3312 }, { "epoch": 7.5473204104903076, "grad_norm": 4.5, "learning_rate": 7.801384895542391e-07, "loss": 0.5556, "mean_token_accuracy": 0.8853226155042648, "num_tokens": 354295337.0, "step": 3313 }, { "epoch": 7.549600912200685, "grad_norm": 2.65625, "learning_rate": 7.78772039912662e-07, "loss": 0.5668, "mean_token_accuracy": 0.8821288347244263, "num_tokens": 354402522.0, "step": 3314 }, { "epoch": 7.55188141391106, "grad_norm": 2.921875, "learning_rate": 7.774065672068463e-07, "loss": 0.5767, "mean_token_accuracy": 0.8814186155796051, "num_tokens": 354509530.0, "step": 3315 }, { "epoch": 7.554161915621437, "grad_norm": 4.1875, "learning_rate": 7.760420722118059e-07, "loss": 0.566, "mean_token_accuracy": 0.8813648819923401, "num_tokens": 354616615.0, "step": 3316 }, { "epoch": 7.556442417331813, "grad_norm": 3.53125, "learning_rate": 7.746785557020034e-07, "loss": 0.564, "mean_token_accuracy": 0.8848537355661392, "num_tokens": 354724411.0, "step": 3317 }, { "epoch": 7.558722919042189, "grad_norm": 4.5, "learning_rate": 7.733160184513447e-07, "loss": 0.5716, "mean_token_accuracy": 0.8836092352867126, "num_tokens": 354831136.0, "step": 3318 }, { "epoch": 7.561003420752566, "grad_norm": 4.25, "learning_rate": 7.719544612331781e-07, "loss": 0.5753, "mean_token_accuracy": 0.8814618289470673, "num_tokens": 354938130.0, "step": 3319 }, { "epoch": 7.563283922462942, "grad_norm": 2.640625, "learning_rate": 7.705938848202985e-07, "loss": 0.585, "mean_token_accuracy": 0.8781514316797256, "num_tokens": 355044678.0, "step": 3320 }, { "epoch": 7.565564424173318, "grad_norm": 2.65625, "learning_rate": 7.692342899849419e-07, "loss": 0.5741, "mean_token_accuracy": 0.883265346288681, "num_tokens": 355152110.0, "step": 3321 }, { "epoch": 7.567844925883694, "grad_norm": 3.03125, "learning_rate": 7.678756774987897e-07, "loss": 0.5704, "mean_token_accuracy": 0.8851431161165237, "num_tokens": 355259414.0, "step": 3322 }, { "epoch": 7.57012542759407, "grad_norm": 2.953125, "learning_rate": 7.665180481329621e-07, "loss": 0.5549, "mean_token_accuracy": 0.8860851675271988, "num_tokens": 355366168.0, "step": 3323 }, { "epoch": 7.572405929304447, "grad_norm": 3.09375, "learning_rate": 7.651614026580243e-07, "loss": 0.5704, "mean_token_accuracy": 0.8835692703723907, "num_tokens": 355472724.0, "step": 3324 }, { "epoch": 7.574686431014824, "grad_norm": 2.703125, "learning_rate": 7.638057418439818e-07, "loss": 0.5896, "mean_token_accuracy": 0.8791484832763672, "num_tokens": 355579877.0, "step": 3325 }, { "epoch": 7.5769669327252, "grad_norm": 2.671875, "learning_rate": 7.624510664602819e-07, "loss": 0.5607, "mean_token_accuracy": 0.8831581175327301, "num_tokens": 355686885.0, "step": 3326 }, { "epoch": 7.579247434435576, "grad_norm": 3.625, "learning_rate": 7.610973772758118e-07, "loss": 0.5867, "mean_token_accuracy": 0.8818655163049698, "num_tokens": 355793461.0, "step": 3327 }, { "epoch": 7.581527936145952, "grad_norm": 3.0, "learning_rate": 7.597446750589005e-07, "loss": 0.579, "mean_token_accuracy": 0.8854977488517761, "num_tokens": 355900140.0, "step": 3328 }, { "epoch": 7.583808437856328, "grad_norm": 3.4375, "learning_rate": 7.583929605773138e-07, "loss": 0.5739, "mean_token_accuracy": 0.8835340738296509, "num_tokens": 356007033.0, "step": 3329 }, { "epoch": 7.586088939566705, "grad_norm": 2.53125, "learning_rate": 7.570422345982598e-07, "loss": 0.558, "mean_token_accuracy": 0.8873008489608765, "num_tokens": 356114209.0, "step": 3330 }, { "epoch": 7.588369441277081, "grad_norm": 3.75, "learning_rate": 7.556924978883843e-07, "loss": 0.5657, "mean_token_accuracy": 0.8829249292612076, "num_tokens": 356220979.0, "step": 3331 }, { "epoch": 7.590649942987457, "grad_norm": 3.8125, "learning_rate": 7.543437512137717e-07, "loss": 0.5788, "mean_token_accuracy": 0.8828289657831192, "num_tokens": 356327873.0, "step": 3332 }, { "epoch": 7.592930444697833, "grad_norm": 3.15625, "learning_rate": 7.529959953399455e-07, "loss": 0.5503, "mean_token_accuracy": 0.8852836340665817, "num_tokens": 356435610.0, "step": 3333 }, { "epoch": 7.59521094640821, "grad_norm": 3.21875, "learning_rate": 7.516492310318643e-07, "loss": 0.5862, "mean_token_accuracy": 0.8795439153909683, "num_tokens": 356542182.0, "step": 3334 }, { "epoch": 7.5974914481185865, "grad_norm": 2.703125, "learning_rate": 7.503034590539266e-07, "loss": 0.5845, "mean_token_accuracy": 0.88029345870018, "num_tokens": 356648911.0, "step": 3335 }, { "epoch": 7.599771949828963, "grad_norm": 3.65625, "learning_rate": 7.489586801699661e-07, "loss": 0.5624, "mean_token_accuracy": 0.8867314010858536, "num_tokens": 356755662.0, "step": 3336 }, { "epoch": 7.602052451539339, "grad_norm": 2.265625, "learning_rate": 7.476148951432543e-07, "loss": 0.5522, "mean_token_accuracy": 0.8867203295230865, "num_tokens": 356862489.0, "step": 3337 }, { "epoch": 7.604332953249715, "grad_norm": 3.375, "learning_rate": 7.462721047364965e-07, "loss": 0.5796, "mean_token_accuracy": 0.8798790127038956, "num_tokens": 356969949.0, "step": 3338 }, { "epoch": 7.606613454960091, "grad_norm": 3.671875, "learning_rate": 7.449303097118355e-07, "loss": 0.572, "mean_token_accuracy": 0.8800079971551895, "num_tokens": 357077056.0, "step": 3339 }, { "epoch": 7.608893956670467, "grad_norm": 2.78125, "learning_rate": 7.435895108308472e-07, "loss": 0.5725, "mean_token_accuracy": 0.8835026770830154, "num_tokens": 357184699.0, "step": 3340 }, { "epoch": 7.611174458380844, "grad_norm": 2.90625, "learning_rate": 7.422497088545436e-07, "loss": 0.5782, "mean_token_accuracy": 0.8809027224779129, "num_tokens": 357291871.0, "step": 3341 }, { "epoch": 7.61345496009122, "grad_norm": 2.578125, "learning_rate": 7.409109045433704e-07, "loss": 0.5732, "mean_token_accuracy": 0.8787789940834045, "num_tokens": 357399018.0, "step": 3342 }, { "epoch": 7.615735461801596, "grad_norm": 3.15625, "learning_rate": 7.395730986572075e-07, "loss": 0.5715, "mean_token_accuracy": 0.8798029869794846, "num_tokens": 357505735.0, "step": 3343 }, { "epoch": 7.618015963511972, "grad_norm": 2.546875, "learning_rate": 7.382362919553682e-07, "loss": 0.5706, "mean_token_accuracy": 0.8806295096874237, "num_tokens": 357613067.0, "step": 3344 }, { "epoch": 7.620296465222349, "grad_norm": 3.296875, "learning_rate": 7.369004851965966e-07, "loss": 0.5788, "mean_token_accuracy": 0.8788366466760635, "num_tokens": 357720663.0, "step": 3345 }, { "epoch": 7.6225769669327255, "grad_norm": 2.921875, "learning_rate": 7.355656791390717e-07, "loss": 0.5734, "mean_token_accuracy": 0.8791816085577011, "num_tokens": 357828239.0, "step": 3346 }, { "epoch": 7.624857468643102, "grad_norm": 4.09375, "learning_rate": 7.342318745404034e-07, "loss": 0.573, "mean_token_accuracy": 0.8827292621135712, "num_tokens": 357935648.0, "step": 3347 }, { "epoch": 7.627137970353478, "grad_norm": 3.046875, "learning_rate": 7.32899072157634e-07, "loss": 0.5635, "mean_token_accuracy": 0.8853467255830765, "num_tokens": 358042684.0, "step": 3348 }, { "epoch": 7.629418472063854, "grad_norm": 4.96875, "learning_rate": 7.315672727472365e-07, "loss": 0.5439, "mean_token_accuracy": 0.8852219432592392, "num_tokens": 358150191.0, "step": 3349 }, { "epoch": 7.63169897377423, "grad_norm": 3.125, "learning_rate": 7.302364770651132e-07, "loss": 0.5628, "mean_token_accuracy": 0.888086274266243, "num_tokens": 358257157.0, "step": 3350 }, { "epoch": 7.633979475484606, "grad_norm": 3.28125, "learning_rate": 7.289066858665991e-07, "loss": 0.5475, "mean_token_accuracy": 0.8860320299863815, "num_tokens": 358364536.0, "step": 3351 }, { "epoch": 7.636259977194983, "grad_norm": 2.734375, "learning_rate": 7.275778999064578e-07, "loss": 0.5927, "mean_token_accuracy": 0.8757172375917435, "num_tokens": 358471817.0, "step": 3352 }, { "epoch": 7.638540478905359, "grad_norm": 2.59375, "learning_rate": 7.262501199388827e-07, "loss": 0.5552, "mean_token_accuracy": 0.8869472742080688, "num_tokens": 358578963.0, "step": 3353 }, { "epoch": 7.640820980615736, "grad_norm": 2.546875, "learning_rate": 7.249233467174965e-07, "loss": 0.5882, "mean_token_accuracy": 0.8762264847755432, "num_tokens": 358685361.0, "step": 3354 }, { "epoch": 7.643101482326112, "grad_norm": 4.40625, "learning_rate": 7.235975809953491e-07, "loss": 0.55, "mean_token_accuracy": 0.8874974101781845, "num_tokens": 358792677.0, "step": 3355 }, { "epoch": 7.645381984036488, "grad_norm": 4.125, "learning_rate": 7.222728235249196e-07, "loss": 0.5887, "mean_token_accuracy": 0.8794031143188477, "num_tokens": 358899865.0, "step": 3356 }, { "epoch": 7.6476624857468645, "grad_norm": 2.828125, "learning_rate": 7.209490750581152e-07, "loss": 0.553, "mean_token_accuracy": 0.8865524530410767, "num_tokens": 359006856.0, "step": 3357 }, { "epoch": 7.649942987457241, "grad_norm": 2.921875, "learning_rate": 7.196263363462699e-07, "loss": 0.575, "mean_token_accuracy": 0.88320592045784, "num_tokens": 359113787.0, "step": 3358 }, { "epoch": 7.652223489167617, "grad_norm": 2.796875, "learning_rate": 7.183046081401454e-07, "loss": 0.565, "mean_token_accuracy": 0.8821183294057846, "num_tokens": 359220641.0, "step": 3359 }, { "epoch": 7.654503990877993, "grad_norm": 3.328125, "learning_rate": 7.169838911899276e-07, "loss": 0.5597, "mean_token_accuracy": 0.8826924860477448, "num_tokens": 359327949.0, "step": 3360 }, { "epoch": 7.656784492588369, "grad_norm": 3.640625, "learning_rate": 7.156641862452316e-07, "loss": 0.5937, "mean_token_accuracy": 0.8792467415332794, "num_tokens": 359434539.0, "step": 3361 }, { "epoch": 7.659064994298745, "grad_norm": 5.59375, "learning_rate": 7.143454940550948e-07, "loss": 0.5683, "mean_token_accuracy": 0.8802933245897293, "num_tokens": 359541747.0, "step": 3362 }, { "epoch": 7.661345496009122, "grad_norm": 2.953125, "learning_rate": 7.13027815367982e-07, "loss": 0.5676, "mean_token_accuracy": 0.8832086026668549, "num_tokens": 359648549.0, "step": 3363 }, { "epoch": 7.663625997719498, "grad_norm": 4.5, "learning_rate": 7.117111509317823e-07, "loss": 0.5735, "mean_token_accuracy": 0.8798936158418655, "num_tokens": 359755583.0, "step": 3364 }, { "epoch": 7.665906499429875, "grad_norm": 5.09375, "learning_rate": 7.103955014938099e-07, "loss": 0.5509, "mean_token_accuracy": 0.885413408279419, "num_tokens": 359862616.0, "step": 3365 }, { "epoch": 7.668187001140251, "grad_norm": 3.125, "learning_rate": 7.090808678008005e-07, "loss": 0.5825, "mean_token_accuracy": 0.8743928670883179, "num_tokens": 359969068.0, "step": 3366 }, { "epoch": 7.670467502850627, "grad_norm": 4.375, "learning_rate": 7.077672505989155e-07, "loss": 0.5938, "mean_token_accuracy": 0.8788380324840546, "num_tokens": 360076499.0, "step": 3367 }, { "epoch": 7.6727480045610035, "grad_norm": 3.09375, "learning_rate": 7.064546506337386e-07, "loss": 0.5747, "mean_token_accuracy": 0.8831783980131149, "num_tokens": 360183723.0, "step": 3368 }, { "epoch": 7.67502850627138, "grad_norm": 3.5625, "learning_rate": 7.051430686502764e-07, "loss": 0.5763, "mean_token_accuracy": 0.8796802908182144, "num_tokens": 360291600.0, "step": 3369 }, { "epoch": 7.677309007981756, "grad_norm": 3.671875, "learning_rate": 7.038325053929582e-07, "loss": 0.5892, "mean_token_accuracy": 0.8816768825054169, "num_tokens": 360398736.0, "step": 3370 }, { "epoch": 7.679589509692132, "grad_norm": 4.3125, "learning_rate": 7.025229616056326e-07, "loss": 0.5799, "mean_token_accuracy": 0.8814764469861984, "num_tokens": 360505355.0, "step": 3371 }, { "epoch": 7.681870011402508, "grad_norm": 2.9375, "learning_rate": 7.012144380315724e-07, "loss": 0.5622, "mean_token_accuracy": 0.8859875947237015, "num_tokens": 360612219.0, "step": 3372 }, { "epoch": 7.684150513112884, "grad_norm": 3.640625, "learning_rate": 6.999069354134703e-07, "loss": 0.5471, "mean_token_accuracy": 0.8861010521650314, "num_tokens": 360720165.0, "step": 3373 }, { "epoch": 7.6864310148232615, "grad_norm": 2.578125, "learning_rate": 6.986004544934394e-07, "loss": 0.588, "mean_token_accuracy": 0.8770526051521301, "num_tokens": 360827091.0, "step": 3374 }, { "epoch": 7.688711516533638, "grad_norm": 3.0, "learning_rate": 6.972949960130135e-07, "loss": 0.5684, "mean_token_accuracy": 0.8841615915298462, "num_tokens": 360934168.0, "step": 3375 }, { "epoch": 7.690992018244014, "grad_norm": 3.71875, "learning_rate": 6.959905607131457e-07, "loss": 0.5652, "mean_token_accuracy": 0.8876559138298035, "num_tokens": 361040859.0, "step": 3376 }, { "epoch": 7.69327251995439, "grad_norm": 3.828125, "learning_rate": 6.946871493342072e-07, "loss": 0.5742, "mean_token_accuracy": 0.8838866651058197, "num_tokens": 361148098.0, "step": 3377 }, { "epoch": 7.695553021664766, "grad_norm": 3.3125, "learning_rate": 6.933847626159898e-07, "loss": 0.5684, "mean_token_accuracy": 0.8839807361364365, "num_tokens": 361255309.0, "step": 3378 }, { "epoch": 7.6978335233751425, "grad_norm": 3.078125, "learning_rate": 6.920834012977032e-07, "loss": 0.5935, "mean_token_accuracy": 0.878353163599968, "num_tokens": 361362125.0, "step": 3379 }, { "epoch": 7.700114025085519, "grad_norm": 2.9375, "learning_rate": 6.907830661179757e-07, "loss": 0.555, "mean_token_accuracy": 0.8843338489532471, "num_tokens": 361468866.0, "step": 3380 }, { "epoch": 7.702394526795895, "grad_norm": 2.5, "learning_rate": 6.894837578148505e-07, "loss": 0.563, "mean_token_accuracy": 0.8871625512838364, "num_tokens": 361576767.0, "step": 3381 }, { "epoch": 7.704675028506271, "grad_norm": 2.953125, "learning_rate": 6.881854771257912e-07, "loss": 0.5575, "mean_token_accuracy": 0.8831998556852341, "num_tokens": 361683691.0, "step": 3382 }, { "epoch": 7.706955530216648, "grad_norm": 3.125, "learning_rate": 6.868882247876776e-07, "loss": 0.5786, "mean_token_accuracy": 0.8782538622617722, "num_tokens": 361790388.0, "step": 3383 }, { "epoch": 7.7092360319270234, "grad_norm": 3.75, "learning_rate": 6.855920015368032e-07, "loss": 0.5964, "mean_token_accuracy": 0.8798200190067291, "num_tokens": 361897277.0, "step": 3384 }, { "epoch": 7.7115165336374005, "grad_norm": 3.265625, "learning_rate": 6.8429680810888e-07, "loss": 0.5862, "mean_token_accuracy": 0.882276862859726, "num_tokens": 362003897.0, "step": 3385 }, { "epoch": 7.713797035347777, "grad_norm": 2.859375, "learning_rate": 6.830026452390354e-07, "loss": 0.5599, "mean_token_accuracy": 0.8820279538631439, "num_tokens": 362111309.0, "step": 3386 }, { "epoch": 7.716077537058153, "grad_norm": 2.71875, "learning_rate": 6.817095136618113e-07, "loss": 0.5743, "mean_token_accuracy": 0.8817304819822311, "num_tokens": 362218467.0, "step": 3387 }, { "epoch": 7.718358038768529, "grad_norm": 3.890625, "learning_rate": 6.804174141111631e-07, "loss": 0.5786, "mean_token_accuracy": 0.8817280679941177, "num_tokens": 362326122.0, "step": 3388 }, { "epoch": 7.720638540478905, "grad_norm": 3.765625, "learning_rate": 6.791263473204624e-07, "loss": 0.586, "mean_token_accuracy": 0.878828227519989, "num_tokens": 362433157.0, "step": 3389 }, { "epoch": 7.7229190421892815, "grad_norm": 2.625, "learning_rate": 6.778363140224933e-07, "loss": 0.5615, "mean_token_accuracy": 0.8845448940992355, "num_tokens": 362540668.0, "step": 3390 }, { "epoch": 7.725199543899658, "grad_norm": 3.46875, "learning_rate": 6.765473149494545e-07, "loss": 0.56, "mean_token_accuracy": 0.884161964058876, "num_tokens": 362648198.0, "step": 3391 }, { "epoch": 7.727480045610034, "grad_norm": 3.75, "learning_rate": 6.752593508329572e-07, "loss": 0.567, "mean_token_accuracy": 0.8821451961994171, "num_tokens": 362755339.0, "step": 3392 }, { "epoch": 7.72976054732041, "grad_norm": 2.625, "learning_rate": 6.739724224040236e-07, "loss": 0.5711, "mean_token_accuracy": 0.8820271492004395, "num_tokens": 362862783.0, "step": 3393 }, { "epoch": 7.732041049030787, "grad_norm": 3.203125, "learning_rate": 6.726865303930905e-07, "loss": 0.5809, "mean_token_accuracy": 0.8791635781526566, "num_tokens": 362969748.0, "step": 3394 }, { "epoch": 7.734321550741163, "grad_norm": 2.5625, "learning_rate": 6.714016755300048e-07, "loss": 0.5938, "mean_token_accuracy": 0.8794443905353546, "num_tokens": 363076599.0, "step": 3395 }, { "epoch": 7.7366020524515395, "grad_norm": 3.40625, "learning_rate": 6.701178585440257e-07, "loss": 0.5509, "mean_token_accuracy": 0.8836392611265182, "num_tokens": 363184034.0, "step": 3396 }, { "epoch": 7.738882554161916, "grad_norm": 2.75, "learning_rate": 6.688350801638235e-07, "loss": 0.5852, "mean_token_accuracy": 0.8802366703748703, "num_tokens": 363290837.0, "step": 3397 }, { "epoch": 7.741163055872292, "grad_norm": 4.84375, "learning_rate": 6.67553341117477e-07, "loss": 0.5755, "mean_token_accuracy": 0.8817736506462097, "num_tokens": 363398189.0, "step": 3398 }, { "epoch": 7.743443557582668, "grad_norm": 2.9375, "learning_rate": 6.662726421324775e-07, "loss": 0.5604, "mean_token_accuracy": 0.8845831155776978, "num_tokens": 363505198.0, "step": 3399 }, { "epoch": 7.745724059293044, "grad_norm": 4.84375, "learning_rate": 6.649929839357247e-07, "loss": 0.5612, "mean_token_accuracy": 0.8841813802719116, "num_tokens": 363612622.0, "step": 3400 }, { "epoch": 7.7480045610034205, "grad_norm": 4.1875, "learning_rate": 6.637143672535282e-07, "loss": 0.561, "mean_token_accuracy": 0.8850229233503342, "num_tokens": 363719127.0, "step": 3401 }, { "epoch": 7.750285062713797, "grad_norm": 3.078125, "learning_rate": 6.624367928116066e-07, "loss": 0.5797, "mean_token_accuracy": 0.8808367401361465, "num_tokens": 363826026.0, "step": 3402 }, { "epoch": 7.752565564424174, "grad_norm": 4.65625, "learning_rate": 6.611602613350854e-07, "loss": 0.5742, "mean_token_accuracy": 0.8835535049438477, "num_tokens": 363933444.0, "step": 3403 }, { "epoch": 7.75484606613455, "grad_norm": 2.78125, "learning_rate": 6.598847735485001e-07, "loss": 0.5656, "mean_token_accuracy": 0.8848689496517181, "num_tokens": 364040465.0, "step": 3404 }, { "epoch": 7.757126567844926, "grad_norm": 3.921875, "learning_rate": 6.586103301757918e-07, "loss": 0.5749, "mean_token_accuracy": 0.880999892950058, "num_tokens": 364147442.0, "step": 3405 }, { "epoch": 7.759407069555302, "grad_norm": 3.765625, "learning_rate": 6.573369319403108e-07, "loss": 0.5683, "mean_token_accuracy": 0.883205309510231, "num_tokens": 364255015.0, "step": 3406 }, { "epoch": 7.7616875712656785, "grad_norm": 3.59375, "learning_rate": 6.560645795648132e-07, "loss": 0.5716, "mean_token_accuracy": 0.8813123106956482, "num_tokens": 364362027.0, "step": 3407 }, { "epoch": 7.763968072976055, "grad_norm": 3.609375, "learning_rate": 6.547932737714624e-07, "loss": 0.5657, "mean_token_accuracy": 0.8851943165063858, "num_tokens": 364469642.0, "step": 3408 }, { "epoch": 7.766248574686431, "grad_norm": 3.484375, "learning_rate": 6.535230152818256e-07, "loss": 0.5576, "mean_token_accuracy": 0.8846663683652878, "num_tokens": 364576740.0, "step": 3409 }, { "epoch": 7.768529076396807, "grad_norm": 3.6875, "learning_rate": 6.522538048168777e-07, "loss": 0.5721, "mean_token_accuracy": 0.8818811625242233, "num_tokens": 364683980.0, "step": 3410 }, { "epoch": 7.770809578107183, "grad_norm": 4.71875, "learning_rate": 6.509856430969982e-07, "loss": 0.5715, "mean_token_accuracy": 0.8817369639873505, "num_tokens": 364790604.0, "step": 3411 }, { "epoch": 7.7730900798175595, "grad_norm": 4.71875, "learning_rate": 6.49718530841971e-07, "loss": 0.5796, "mean_token_accuracy": 0.876135990023613, "num_tokens": 364897881.0, "step": 3412 }, { "epoch": 7.775370581527936, "grad_norm": 4.375, "learning_rate": 6.484524687709853e-07, "loss": 0.5643, "mean_token_accuracy": 0.8819546699523926, "num_tokens": 365004584.0, "step": 3413 }, { "epoch": 7.777651083238313, "grad_norm": 2.640625, "learning_rate": 6.471874576026321e-07, "loss": 0.5695, "mean_token_accuracy": 0.8798770606517792, "num_tokens": 365111502.0, "step": 3414 }, { "epoch": 7.779931584948689, "grad_norm": 3.34375, "learning_rate": 6.459234980549081e-07, "loss": 0.5942, "mean_token_accuracy": 0.8755921274423599, "num_tokens": 365217932.0, "step": 3415 }, { "epoch": 7.782212086659065, "grad_norm": 3.3125, "learning_rate": 6.446605908452122e-07, "loss": 0.573, "mean_token_accuracy": 0.8808364421129227, "num_tokens": 365324803.0, "step": 3416 }, { "epoch": 7.784492588369441, "grad_norm": 3.0, "learning_rate": 6.433987366903461e-07, "loss": 0.5739, "mean_token_accuracy": 0.8788796812295914, "num_tokens": 365432029.0, "step": 3417 }, { "epoch": 7.7867730900798175, "grad_norm": 2.765625, "learning_rate": 6.421379363065142e-07, "loss": 0.5695, "mean_token_accuracy": 0.8842990696430206, "num_tokens": 365539725.0, "step": 3418 }, { "epoch": 7.789053591790194, "grad_norm": 2.78125, "learning_rate": 6.408781904093228e-07, "loss": 0.585, "mean_token_accuracy": 0.8797939419746399, "num_tokens": 365646379.0, "step": 3419 }, { "epoch": 7.79133409350057, "grad_norm": 3.21875, "learning_rate": 6.39619499713778e-07, "loss": 0.5456, "mean_token_accuracy": 0.8892961293458939, "num_tokens": 365753657.0, "step": 3420 }, { "epoch": 7.793614595210946, "grad_norm": 2.796875, "learning_rate": 6.383618649342894e-07, "loss": 0.5718, "mean_token_accuracy": 0.8814832419157028, "num_tokens": 365859878.0, "step": 3421 }, { "epoch": 7.795895096921322, "grad_norm": 3.40625, "learning_rate": 6.371052867846658e-07, "loss": 0.5887, "mean_token_accuracy": 0.8824429661035538, "num_tokens": 365966802.0, "step": 3422 }, { "epoch": 7.798175598631699, "grad_norm": 3.234375, "learning_rate": 6.358497659781177e-07, "loss": 0.5513, "mean_token_accuracy": 0.8872740119695663, "num_tokens": 366073841.0, "step": 3423 }, { "epoch": 7.800456100342076, "grad_norm": 4.1875, "learning_rate": 6.345953032272525e-07, "loss": 0.5757, "mean_token_accuracy": 0.8818953335285187, "num_tokens": 366180970.0, "step": 3424 }, { "epoch": 7.802736602052452, "grad_norm": 3.3125, "learning_rate": 6.333418992440804e-07, "loss": 0.5747, "mean_token_accuracy": 0.8802995383739471, "num_tokens": 366287962.0, "step": 3425 }, { "epoch": 7.805017103762828, "grad_norm": 3.015625, "learning_rate": 6.3208955474001e-07, "loss": 0.5699, "mean_token_accuracy": 0.8855053037405014, "num_tokens": 366395208.0, "step": 3426 }, { "epoch": 7.807297605473204, "grad_norm": 2.71875, "learning_rate": 6.308382704258459e-07, "loss": 0.5819, "mean_token_accuracy": 0.8811968117952347, "num_tokens": 366502174.0, "step": 3427 }, { "epoch": 7.80957810718358, "grad_norm": 5.4375, "learning_rate": 6.29588047011794e-07, "loss": 0.5882, "mean_token_accuracy": 0.8795420229434967, "num_tokens": 366609589.0, "step": 3428 }, { "epoch": 7.811858608893957, "grad_norm": 2.375, "learning_rate": 6.283388852074576e-07, "loss": 0.5691, "mean_token_accuracy": 0.8850322365760803, "num_tokens": 366716534.0, "step": 3429 }, { "epoch": 7.814139110604333, "grad_norm": 2.703125, "learning_rate": 6.270907857218356e-07, "loss": 0.5718, "mean_token_accuracy": 0.882201686501503, "num_tokens": 366823931.0, "step": 3430 }, { "epoch": 7.816419612314709, "grad_norm": 3.265625, "learning_rate": 6.258437492633254e-07, "loss": 0.552, "mean_token_accuracy": 0.887025773525238, "num_tokens": 366930916.0, "step": 3431 }, { "epoch": 7.818700114025085, "grad_norm": 2.484375, "learning_rate": 6.245977765397216e-07, "loss": 0.5631, "mean_token_accuracy": 0.8866972625255585, "num_tokens": 367038203.0, "step": 3432 }, { "epoch": 7.820980615735461, "grad_norm": 3.28125, "learning_rate": 6.233528682582132e-07, "loss": 0.5618, "mean_token_accuracy": 0.8818273693323135, "num_tokens": 367145120.0, "step": 3433 }, { "epoch": 7.823261117445838, "grad_norm": 3.796875, "learning_rate": 6.221090251253872e-07, "loss": 0.5571, "mean_token_accuracy": 0.8845161199569702, "num_tokens": 367252872.0, "step": 3434 }, { "epoch": 7.825541619156215, "grad_norm": 2.625, "learning_rate": 6.208662478472249e-07, "loss": 0.5619, "mean_token_accuracy": 0.8831181675195694, "num_tokens": 367359928.0, "step": 3435 }, { "epoch": 7.827822120866591, "grad_norm": 2.53125, "learning_rate": 6.196245371291015e-07, "loss": 0.566, "mean_token_accuracy": 0.8835492432117462, "num_tokens": 367467008.0, "step": 3436 }, { "epoch": 7.830102622576967, "grad_norm": 3.265625, "learning_rate": 6.183838936757891e-07, "loss": 0.5847, "mean_token_accuracy": 0.8792263269424438, "num_tokens": 367574251.0, "step": 3437 }, { "epoch": 7.832383124287343, "grad_norm": 2.84375, "learning_rate": 6.171443181914524e-07, "loss": 0.5484, "mean_token_accuracy": 0.8878850936889648, "num_tokens": 367682131.0, "step": 3438 }, { "epoch": 7.834663625997719, "grad_norm": 3.15625, "learning_rate": 6.159058113796507e-07, "loss": 0.5745, "mean_token_accuracy": 0.8801481425762177, "num_tokens": 367789432.0, "step": 3439 }, { "epoch": 7.836944127708096, "grad_norm": 2.71875, "learning_rate": 6.146683739433374e-07, "loss": 0.5901, "mean_token_accuracy": 0.8798154145479202, "num_tokens": 367896299.0, "step": 3440 }, { "epoch": 7.839224629418472, "grad_norm": 2.8125, "learning_rate": 6.134320065848564e-07, "loss": 0.5795, "mean_token_accuracy": 0.8814557641744614, "num_tokens": 368003358.0, "step": 3441 }, { "epoch": 7.841505131128848, "grad_norm": 2.828125, "learning_rate": 6.121967100059473e-07, "loss": 0.5912, "mean_token_accuracy": 0.8781363666057587, "num_tokens": 368110282.0, "step": 3442 }, { "epoch": 7.843785632839225, "grad_norm": 4.28125, "learning_rate": 6.109624849077397e-07, "loss": 0.5759, "mean_token_accuracy": 0.8851889967918396, "num_tokens": 368217316.0, "step": 3443 }, { "epoch": 7.846066134549601, "grad_norm": 2.84375, "learning_rate": 6.097293319907566e-07, "loss": 0.5741, "mean_token_accuracy": 0.8824188113212585, "num_tokens": 368324078.0, "step": 3444 }, { "epoch": 7.848346636259977, "grad_norm": 3.390625, "learning_rate": 6.084972519549123e-07, "loss": 0.5895, "mean_token_accuracy": 0.8789727538824081, "num_tokens": 368430695.0, "step": 3445 }, { "epoch": 7.850627137970354, "grad_norm": 4.78125, "learning_rate": 6.072662454995101e-07, "loss": 0.5727, "mean_token_accuracy": 0.8788609057664871, "num_tokens": 368537559.0, "step": 3446 }, { "epoch": 7.85290763968073, "grad_norm": 3.28125, "learning_rate": 6.060363133232472e-07, "loss": 0.5585, "mean_token_accuracy": 0.8846858441829681, "num_tokens": 368645105.0, "step": 3447 }, { "epoch": 7.855188141391106, "grad_norm": 3.421875, "learning_rate": 6.048074561242076e-07, "loss": 0.5778, "mean_token_accuracy": 0.8781778067350388, "num_tokens": 368752515.0, "step": 3448 }, { "epoch": 7.857468643101482, "grad_norm": 2.5625, "learning_rate": 6.035796745998679e-07, "loss": 0.5448, "mean_token_accuracy": 0.8889462649822235, "num_tokens": 368859963.0, "step": 3449 }, { "epoch": 7.859749144811858, "grad_norm": 3.265625, "learning_rate": 6.023529694470931e-07, "loss": 0.568, "mean_token_accuracy": 0.8810675591230392, "num_tokens": 368966665.0, "step": 3450 }, { "epoch": 7.862029646522235, "grad_norm": 3.0625, "learning_rate": 6.01127341362138e-07, "loss": 0.57, "mean_token_accuracy": 0.8824652433395386, "num_tokens": 369073545.0, "step": 3451 }, { "epoch": 7.864310148232612, "grad_norm": 2.78125, "learning_rate": 5.999027910406441e-07, "loss": 0.5804, "mean_token_accuracy": 0.8798021376132965, "num_tokens": 369180613.0, "step": 3452 }, { "epoch": 7.866590649942988, "grad_norm": 3.390625, "learning_rate": 5.98679319177643e-07, "loss": 0.5666, "mean_token_accuracy": 0.8812180906534195, "num_tokens": 369287568.0, "step": 3453 }, { "epoch": 7.868871151653364, "grad_norm": 3.125, "learning_rate": 5.974569264675542e-07, "loss": 0.5585, "mean_token_accuracy": 0.8855846971273422, "num_tokens": 369394613.0, "step": 3454 }, { "epoch": 7.87115165336374, "grad_norm": 3.25, "learning_rate": 5.962356136041835e-07, "loss": 0.5601, "mean_token_accuracy": 0.8867956697940826, "num_tokens": 369501322.0, "step": 3455 }, { "epoch": 7.873432155074116, "grad_norm": 3.265625, "learning_rate": 5.95015381280726e-07, "loss": 0.5581, "mean_token_accuracy": 0.8840297311544418, "num_tokens": 369608680.0, "step": 3456 }, { "epoch": 7.875712656784493, "grad_norm": 3.46875, "learning_rate": 5.937962301897604e-07, "loss": 0.5664, "mean_token_accuracy": 0.8828248530626297, "num_tokens": 369716370.0, "step": 3457 }, { "epoch": 7.877993158494869, "grad_norm": 2.9375, "learning_rate": 5.925781610232534e-07, "loss": 0.5658, "mean_token_accuracy": 0.8825751543045044, "num_tokens": 369823697.0, "step": 3458 }, { "epoch": 7.880273660205245, "grad_norm": 4.5625, "learning_rate": 5.913611744725584e-07, "loss": 0.5753, "mean_token_accuracy": 0.8807247132062912, "num_tokens": 369931399.0, "step": 3459 }, { "epoch": 7.882554161915621, "grad_norm": 3.109375, "learning_rate": 5.901452712284128e-07, "loss": 0.584, "mean_token_accuracy": 0.881399855017662, "num_tokens": 370038799.0, "step": 3460 }, { "epoch": 7.884834663625997, "grad_norm": 5.0, "learning_rate": 5.889304519809402e-07, "loss": 0.5778, "mean_token_accuracy": 0.8788302540779114, "num_tokens": 370146637.0, "step": 3461 }, { "epoch": 7.887115165336374, "grad_norm": 2.84375, "learning_rate": 5.877167174196491e-07, "loss": 0.5616, "mean_token_accuracy": 0.8876172602176666, "num_tokens": 370253754.0, "step": 3462 }, { "epoch": 7.889395667046751, "grad_norm": 3.015625, "learning_rate": 5.865040682334303e-07, "loss": 0.5761, "mean_token_accuracy": 0.8796623796224594, "num_tokens": 370360287.0, "step": 3463 }, { "epoch": 7.891676168757127, "grad_norm": 3.875, "learning_rate": 5.852925051105609e-07, "loss": 0.5755, "mean_token_accuracy": 0.8807369768619537, "num_tokens": 370467034.0, "step": 3464 }, { "epoch": 7.893956670467503, "grad_norm": 2.84375, "learning_rate": 5.840820287387009e-07, "loss": 0.5771, "mean_token_accuracy": 0.8808783739805222, "num_tokens": 370573928.0, "step": 3465 }, { "epoch": 7.896237172177879, "grad_norm": 2.953125, "learning_rate": 5.828726398048939e-07, "loss": 0.5845, "mean_token_accuracy": 0.8822729289531708, "num_tokens": 370680253.0, "step": 3466 }, { "epoch": 7.898517673888255, "grad_norm": 2.984375, "learning_rate": 5.816643389955642e-07, "loss": 0.5698, "mean_token_accuracy": 0.8849064260721207, "num_tokens": 370786997.0, "step": 3467 }, { "epoch": 7.900798175598632, "grad_norm": 3.5, "learning_rate": 5.804571269965206e-07, "loss": 0.5778, "mean_token_accuracy": 0.8811779320240021, "num_tokens": 370893845.0, "step": 3468 }, { "epoch": 7.903078677309008, "grad_norm": 4.5625, "learning_rate": 5.792510044929545e-07, "loss": 0.5769, "mean_token_accuracy": 0.8799442201852798, "num_tokens": 371000695.0, "step": 3469 }, { "epoch": 7.905359179019384, "grad_norm": 2.609375, "learning_rate": 5.780459721694359e-07, "loss": 0.5749, "mean_token_accuracy": 0.8812065422534943, "num_tokens": 371107589.0, "step": 3470 }, { "epoch": 7.90763968072976, "grad_norm": 3.203125, "learning_rate": 5.768420307099188e-07, "loss": 0.563, "mean_token_accuracy": 0.8848077207803726, "num_tokens": 371214705.0, "step": 3471 }, { "epoch": 7.909920182440137, "grad_norm": 3.28125, "learning_rate": 5.756391807977377e-07, "loss": 0.5721, "mean_token_accuracy": 0.8829392790794373, "num_tokens": 371321954.0, "step": 3472 }, { "epoch": 7.9122006841505135, "grad_norm": 3.84375, "learning_rate": 5.744374231156056e-07, "loss": 0.5769, "mean_token_accuracy": 0.87844018638134, "num_tokens": 371428415.0, "step": 3473 }, { "epoch": 7.91448118586089, "grad_norm": 4.96875, "learning_rate": 5.732367583456177e-07, "loss": 0.5671, "mean_token_accuracy": 0.879816859960556, "num_tokens": 371535322.0, "step": 3474 }, { "epoch": 7.916761687571266, "grad_norm": 2.625, "learning_rate": 5.720371871692484e-07, "loss": 0.5599, "mean_token_accuracy": 0.8838074952363968, "num_tokens": 371642326.0, "step": 3475 }, { "epoch": 7.919042189281642, "grad_norm": 3.0625, "learning_rate": 5.708387102673507e-07, "loss": 0.5495, "mean_token_accuracy": 0.8860156238079071, "num_tokens": 371749387.0, "step": 3476 }, { "epoch": 7.921322690992018, "grad_norm": 3.109375, "learning_rate": 5.696413283201571e-07, "loss": 0.5784, "mean_token_accuracy": 0.8818910270929337, "num_tokens": 371856387.0, "step": 3477 }, { "epoch": 7.923603192702394, "grad_norm": 2.75, "learning_rate": 5.684450420072792e-07, "loss": 0.5595, "mean_token_accuracy": 0.8838623762130737, "num_tokens": 371963635.0, "step": 3478 }, { "epoch": 7.925883694412771, "grad_norm": 2.78125, "learning_rate": 5.67249852007705e-07, "loss": 0.5367, "mean_token_accuracy": 0.8921998292207718, "num_tokens": 372071179.0, "step": 3479 }, { "epoch": 7.928164196123147, "grad_norm": 3.765625, "learning_rate": 5.660557589998014e-07, "loss": 0.5764, "mean_token_accuracy": 0.88170425593853, "num_tokens": 372178998.0, "step": 3480 }, { "epoch": 7.930444697833523, "grad_norm": 4.84375, "learning_rate": 5.648627636613127e-07, "loss": 0.5834, "mean_token_accuracy": 0.8786900341510773, "num_tokens": 372286187.0, "step": 3481 }, { "epoch": 7.932725199543899, "grad_norm": 3.78125, "learning_rate": 5.636708666693599e-07, "loss": 0.5475, "mean_token_accuracy": 0.887066513299942, "num_tokens": 372393703.0, "step": 3482 }, { "epoch": 7.935005701254276, "grad_norm": 2.8125, "learning_rate": 5.62480068700442e-07, "loss": 0.5779, "mean_token_accuracy": 0.879653736948967, "num_tokens": 372500635.0, "step": 3483 }, { "epoch": 7.9372862029646525, "grad_norm": 3.03125, "learning_rate": 5.612903704304309e-07, "loss": 0.5343, "mean_token_accuracy": 0.8889681696891785, "num_tokens": 372608260.0, "step": 3484 }, { "epoch": 7.939566704675029, "grad_norm": 3.984375, "learning_rate": 5.601017725345772e-07, "loss": 0.5619, "mean_token_accuracy": 0.8840508311986923, "num_tokens": 372715730.0, "step": 3485 }, { "epoch": 7.941847206385405, "grad_norm": 2.765625, "learning_rate": 5.589142756875065e-07, "loss": 0.5715, "mean_token_accuracy": 0.8822902888059616, "num_tokens": 372822393.0, "step": 3486 }, { "epoch": 7.944127708095781, "grad_norm": 3.6875, "learning_rate": 5.577278805632186e-07, "loss": 0.5717, "mean_token_accuracy": 0.8777175396680832, "num_tokens": 372929636.0, "step": 3487 }, { "epoch": 7.946408209806157, "grad_norm": 2.6875, "learning_rate": 5.565425878350895e-07, "loss": 0.581, "mean_token_accuracy": 0.8786440938711166, "num_tokens": 373036503.0, "step": 3488 }, { "epoch": 7.9486887115165334, "grad_norm": 2.34375, "learning_rate": 5.553583981758668e-07, "loss": 0.5385, "mean_token_accuracy": 0.8891723304986954, "num_tokens": 373143771.0, "step": 3489 }, { "epoch": 7.95096921322691, "grad_norm": 2.625, "learning_rate": 5.541753122576746e-07, "loss": 0.5589, "mean_token_accuracy": 0.8819065690040588, "num_tokens": 373250701.0, "step": 3490 }, { "epoch": 7.953249714937286, "grad_norm": 3.0625, "learning_rate": 5.529933307520102e-07, "loss": 0.5747, "mean_token_accuracy": 0.8800751566886902, "num_tokens": 373357804.0, "step": 3491 }, { "epoch": 7.955530216647663, "grad_norm": 2.75, "learning_rate": 5.518124543297423e-07, "loss": 0.5565, "mean_token_accuracy": 0.8874002993106842, "num_tokens": 373464625.0, "step": 3492 }, { "epoch": 7.957810718358039, "grad_norm": 4.09375, "learning_rate": 5.506326836611139e-07, "loss": 0.5783, "mean_token_accuracy": 0.8810268938541412, "num_tokens": 373572124.0, "step": 3493 }, { "epoch": 7.960091220068415, "grad_norm": 3.734375, "learning_rate": 5.494540194157411e-07, "loss": 0.5765, "mean_token_accuracy": 0.8803278654813766, "num_tokens": 373678778.0, "step": 3494 }, { "epoch": 7.9623717217787915, "grad_norm": 2.90625, "learning_rate": 5.482764622626094e-07, "loss": 0.554, "mean_token_accuracy": 0.8871753364801407, "num_tokens": 373785455.0, "step": 3495 }, { "epoch": 7.964652223489168, "grad_norm": 2.890625, "learning_rate": 5.471000128700784e-07, "loss": 0.5617, "mean_token_accuracy": 0.8817069977521896, "num_tokens": 373892856.0, "step": 3496 }, { "epoch": 7.966932725199544, "grad_norm": 3.265625, "learning_rate": 5.459246719058778e-07, "loss": 0.5691, "mean_token_accuracy": 0.8817310035228729, "num_tokens": 374000078.0, "step": 3497 }, { "epoch": 7.96921322690992, "grad_norm": 3.15625, "learning_rate": 5.447504400371084e-07, "loss": 0.5618, "mean_token_accuracy": 0.8855457454919815, "num_tokens": 374107031.0, "step": 3498 }, { "epoch": 7.971493728620296, "grad_norm": 3.40625, "learning_rate": 5.435773179302426e-07, "loss": 0.5873, "mean_token_accuracy": 0.8799401372671127, "num_tokens": 374213605.0, "step": 3499 }, { "epoch": 7.9737742303306725, "grad_norm": 2.75, "learning_rate": 5.4240530625112e-07, "loss": 0.5501, "mean_token_accuracy": 0.885622963309288, "num_tokens": 374321608.0, "step": 3500 }, { "epoch": 7.976054732041049, "grad_norm": 3.09375, "learning_rate": 5.412344056649527e-07, "loss": 0.5896, "mean_token_accuracy": 0.8806732445955276, "num_tokens": 374427692.0, "step": 3501 }, { "epoch": 7.978335233751425, "grad_norm": 3.234375, "learning_rate": 5.400646168363216e-07, "loss": 0.5627, "mean_token_accuracy": 0.885017529129982, "num_tokens": 374534923.0, "step": 3502 }, { "epoch": 7.980615735461802, "grad_norm": 4.21875, "learning_rate": 5.388959404291757e-07, "loss": 0.5489, "mean_token_accuracy": 0.8873688131570816, "num_tokens": 374641673.0, "step": 3503 }, { "epoch": 7.982896237172178, "grad_norm": 3.328125, "learning_rate": 5.377283771068342e-07, "loss": 0.5787, "mean_token_accuracy": 0.8803865015506744, "num_tokens": 374748674.0, "step": 3504 }, { "epoch": 7.985176738882554, "grad_norm": 2.90625, "learning_rate": 5.365619275319823e-07, "loss": 0.585, "mean_token_accuracy": 0.879953607916832, "num_tokens": 374855343.0, "step": 3505 }, { "epoch": 7.9874572405929305, "grad_norm": 3.15625, "learning_rate": 5.353965923666743e-07, "loss": 0.5742, "mean_token_accuracy": 0.8788940012454987, "num_tokens": 374962436.0, "step": 3506 }, { "epoch": 7.989737742303307, "grad_norm": 2.75, "learning_rate": 5.342323722723324e-07, "loss": 0.571, "mean_token_accuracy": 0.8827593922615051, "num_tokens": 375068957.0, "step": 3507 }, { "epoch": 7.992018244013683, "grad_norm": 3.125, "learning_rate": 5.330692679097457e-07, "loss": 0.5441, "mean_token_accuracy": 0.8856361508369446, "num_tokens": 375176504.0, "step": 3508 }, { "epoch": 7.994298745724059, "grad_norm": 3.78125, "learning_rate": 5.319072799390693e-07, "loss": 0.5954, "mean_token_accuracy": 0.8784863501787186, "num_tokens": 375283610.0, "step": 3509 }, { "epoch": 7.996579247434435, "grad_norm": 3.796875, "learning_rate": 5.307464090198258e-07, "loss": 0.5606, "mean_token_accuracy": 0.8854129165410995, "num_tokens": 375390831.0, "step": 3510 }, { "epoch": 7.9988597491448115, "grad_norm": 3.53125, "learning_rate": 5.295866558109023e-07, "loss": 0.5461, "mean_token_accuracy": 0.8877543658018112, "num_tokens": 375498660.0, "step": 3511 }, { "epoch": 8.0, "grad_norm": 5.6875, "learning_rate": 5.284280209705531e-07, "loss": 0.5913, "mean_token_accuracy": 0.8680516481399536, "num_tokens": 375537856.0, "step": 3512 }, { "epoch": 8.002280501710377, "grad_norm": 3.078125, "learning_rate": 5.272705051563959e-07, "loss": 0.5911, "mean_token_accuracy": 0.8796858638525009, "num_tokens": 375645015.0, "step": 3513 }, { "epoch": 8.004561003420752, "grad_norm": 2.78125, "learning_rate": 5.261141090254149e-07, "loss": 0.57, "mean_token_accuracy": 0.880715399980545, "num_tokens": 375752133.0, "step": 3514 }, { "epoch": 8.00684150513113, "grad_norm": 2.8125, "learning_rate": 5.249588332339589e-07, "loss": 0.5635, "mean_token_accuracy": 0.8854578584432602, "num_tokens": 375859485.0, "step": 3515 }, { "epoch": 8.009122006841505, "grad_norm": 3.09375, "learning_rate": 5.238046784377388e-07, "loss": 0.5818, "mean_token_accuracy": 0.8783604055643082, "num_tokens": 375965833.0, "step": 3516 }, { "epoch": 8.011402508551882, "grad_norm": 2.90625, "learning_rate": 5.226516452918315e-07, "loss": 0.5738, "mean_token_accuracy": 0.8814006298780441, "num_tokens": 376072854.0, "step": 3517 }, { "epoch": 8.013683010262257, "grad_norm": 2.84375, "learning_rate": 5.214997344506758e-07, "loss": 0.5739, "mean_token_accuracy": 0.884096547961235, "num_tokens": 376179522.0, "step": 3518 }, { "epoch": 8.015963511972634, "grad_norm": 3.140625, "learning_rate": 5.203489465680747e-07, "loss": 0.5921, "mean_token_accuracy": 0.8821551650762558, "num_tokens": 376286201.0, "step": 3519 }, { "epoch": 8.01824401368301, "grad_norm": 3.234375, "learning_rate": 5.19199282297193e-07, "loss": 0.5474, "mean_token_accuracy": 0.8864544034004211, "num_tokens": 376394233.0, "step": 3520 }, { "epoch": 8.01824401368301, "eval_loss": 0.5862906575202942, "eval_mean_token_accuracy": 0.8799242350085154, "eval_num_tokens": 376394233.0, "eval_runtime": 58.678, "eval_samples_per_second": 142.899, "eval_steps_per_second": 4.482, "step": 3520 }, { "epoch": 8.020524515393387, "grad_norm": 2.34375, "learning_rate": 5.180507422905585e-07, "loss": 0.57, "mean_token_accuracy": 0.8834462761878967, "num_tokens": 376501043.0, "step": 3521 }, { "epoch": 8.022805017103764, "grad_norm": 2.78125, "learning_rate": 5.169033272000587e-07, "loss": 0.5796, "mean_token_accuracy": 0.8795521408319473, "num_tokens": 376607841.0, "step": 3522 }, { "epoch": 8.025085518814139, "grad_norm": 2.765625, "learning_rate": 5.157570376769452e-07, "loss": 0.572, "mean_token_accuracy": 0.8829406499862671, "num_tokens": 376714615.0, "step": 3523 }, { "epoch": 8.027366020524516, "grad_norm": 2.796875, "learning_rate": 5.146118743718301e-07, "loss": 0.5691, "mean_token_accuracy": 0.885052427649498, "num_tokens": 376822883.0, "step": 3524 }, { "epoch": 8.029646522234891, "grad_norm": 3.390625, "learning_rate": 5.134678379346856e-07, "loss": 0.5587, "mean_token_accuracy": 0.8828334510326385, "num_tokens": 376929625.0, "step": 3525 }, { "epoch": 8.031927023945268, "grad_norm": 3.796875, "learning_rate": 5.123249290148452e-07, "loss": 0.5781, "mean_token_accuracy": 0.8789701759815216, "num_tokens": 377036782.0, "step": 3526 }, { "epoch": 8.034207525655644, "grad_norm": 3.328125, "learning_rate": 5.111831482610011e-07, "loss": 0.5675, "mean_token_accuracy": 0.8839947730302811, "num_tokens": 377143972.0, "step": 3527 }, { "epoch": 8.03648802736602, "grad_norm": 3.203125, "learning_rate": 5.100424963212064e-07, "loss": 0.5748, "mean_token_accuracy": 0.8813960254192352, "num_tokens": 377250375.0, "step": 3528 }, { "epoch": 8.038768529076396, "grad_norm": 2.46875, "learning_rate": 5.089029738428733e-07, "loss": 0.5559, "mean_token_accuracy": 0.8840717822313309, "num_tokens": 377357434.0, "step": 3529 }, { "epoch": 8.041049030786773, "grad_norm": 3.90625, "learning_rate": 5.077645814727725e-07, "loss": 0.5816, "mean_token_accuracy": 0.8795655369758606, "num_tokens": 377464466.0, "step": 3530 }, { "epoch": 8.043329532497149, "grad_norm": 3.53125, "learning_rate": 5.066273198570343e-07, "loss": 0.5863, "mean_token_accuracy": 0.8808377087116241, "num_tokens": 377570549.0, "step": 3531 }, { "epoch": 8.045610034207526, "grad_norm": 4.09375, "learning_rate": 5.054911896411452e-07, "loss": 0.5662, "mean_token_accuracy": 0.8835866749286652, "num_tokens": 377678050.0, "step": 3532 }, { "epoch": 8.047890535917903, "grad_norm": 2.65625, "learning_rate": 5.043561914699513e-07, "loss": 0.5657, "mean_token_accuracy": 0.8832181841135025, "num_tokens": 377785329.0, "step": 3533 }, { "epoch": 8.050171037628278, "grad_norm": 3.609375, "learning_rate": 5.032223259876565e-07, "loss": 0.5881, "mean_token_accuracy": 0.8773612827062607, "num_tokens": 377892051.0, "step": 3534 }, { "epoch": 8.052451539338655, "grad_norm": 3.71875, "learning_rate": 5.020895938378195e-07, "loss": 0.5738, "mean_token_accuracy": 0.8818919360637665, "num_tokens": 377999467.0, "step": 3535 }, { "epoch": 8.05473204104903, "grad_norm": 2.6875, "learning_rate": 5.009579956633578e-07, "loss": 0.5568, "mean_token_accuracy": 0.884328693151474, "num_tokens": 378106965.0, "step": 3536 }, { "epoch": 8.057012542759407, "grad_norm": 3.46875, "learning_rate": 4.998275321065454e-07, "loss": 0.5704, "mean_token_accuracy": 0.8801892846822739, "num_tokens": 378214261.0, "step": 3537 }, { "epoch": 8.059293044469783, "grad_norm": 3.5, "learning_rate": 4.986982038090104e-07, "loss": 0.5831, "mean_token_accuracy": 0.8822603821754456, "num_tokens": 378320449.0, "step": 3538 }, { "epoch": 8.06157354618016, "grad_norm": 2.921875, "learning_rate": 4.975700114117385e-07, "loss": 0.5418, "mean_token_accuracy": 0.8873136639595032, "num_tokens": 378427513.0, "step": 3539 }, { "epoch": 8.063854047890535, "grad_norm": 2.875, "learning_rate": 4.964429555550693e-07, "loss": 0.5841, "mean_token_accuracy": 0.8837717622518539, "num_tokens": 378534328.0, "step": 3540 }, { "epoch": 8.066134549600912, "grad_norm": 3.296875, "learning_rate": 4.953170368786985e-07, "loss": 0.5401, "mean_token_accuracy": 0.8866355121135712, "num_tokens": 378641467.0, "step": 3541 }, { "epoch": 8.06841505131129, "grad_norm": 2.90625, "learning_rate": 4.941922560216764e-07, "loss": 0.5616, "mean_token_accuracy": 0.8841693699359894, "num_tokens": 378748281.0, "step": 3542 }, { "epoch": 8.070695553021665, "grad_norm": 3.125, "learning_rate": 4.930686136224056e-07, "loss": 0.5686, "mean_token_accuracy": 0.8798070102930069, "num_tokens": 378854984.0, "step": 3543 }, { "epoch": 8.072976054732042, "grad_norm": 4.90625, "learning_rate": 4.91946110318644e-07, "loss": 0.5871, "mean_token_accuracy": 0.8762340992689133, "num_tokens": 378962257.0, "step": 3544 }, { "epoch": 8.075256556442417, "grad_norm": 2.390625, "learning_rate": 4.908247467475036e-07, "loss": 0.5628, "mean_token_accuracy": 0.8846005648374557, "num_tokens": 379069977.0, "step": 3545 }, { "epoch": 8.077537058152794, "grad_norm": 3.828125, "learning_rate": 4.897045235454481e-07, "loss": 0.5755, "mean_token_accuracy": 0.882202997803688, "num_tokens": 379177064.0, "step": 3546 }, { "epoch": 8.07981755986317, "grad_norm": 5.0, "learning_rate": 4.885854413482955e-07, "loss": 0.5663, "mean_token_accuracy": 0.8858351409435272, "num_tokens": 379284331.0, "step": 3547 }, { "epoch": 8.082098061573546, "grad_norm": 3.921875, "learning_rate": 4.874675007912138e-07, "loss": 0.56, "mean_token_accuracy": 0.8848386406898499, "num_tokens": 379391861.0, "step": 3548 }, { "epoch": 8.084378563283922, "grad_norm": 4.96875, "learning_rate": 4.863507025087255e-07, "loss": 0.5525, "mean_token_accuracy": 0.8858330547809601, "num_tokens": 379499126.0, "step": 3549 }, { "epoch": 8.086659064994299, "grad_norm": 3.015625, "learning_rate": 4.852350471347031e-07, "loss": 0.5846, "mean_token_accuracy": 0.8791882544755936, "num_tokens": 379605713.0, "step": 3550 }, { "epoch": 8.088939566704674, "grad_norm": 3.109375, "learning_rate": 4.841205353023715e-07, "loss": 0.5628, "mean_token_accuracy": 0.881306603550911, "num_tokens": 379712727.0, "step": 3551 }, { "epoch": 8.091220068415051, "grad_norm": 2.875, "learning_rate": 4.83007167644306e-07, "loss": 0.5438, "mean_token_accuracy": 0.8896623253822327, "num_tokens": 379819603.0, "step": 3552 }, { "epoch": 8.093500570125428, "grad_norm": 3.796875, "learning_rate": 4.818949447924334e-07, "loss": 0.5742, "mean_token_accuracy": 0.881472259759903, "num_tokens": 379926447.0, "step": 3553 }, { "epoch": 8.095781071835804, "grad_norm": 2.59375, "learning_rate": 4.807838673780282e-07, "loss": 0.5636, "mean_token_accuracy": 0.8816197216510773, "num_tokens": 380033718.0, "step": 3554 }, { "epoch": 8.09806157354618, "grad_norm": 2.96875, "learning_rate": 4.796739360317181e-07, "loss": 0.56, "mean_token_accuracy": 0.8843680173158646, "num_tokens": 380140392.0, "step": 3555 }, { "epoch": 8.100342075256556, "grad_norm": 2.8125, "learning_rate": 4.785651513834774e-07, "loss": 0.5553, "mean_token_accuracy": 0.8834531456232071, "num_tokens": 380247239.0, "step": 3556 }, { "epoch": 8.102622576966933, "grad_norm": 4.78125, "learning_rate": 4.774575140626317e-07, "loss": 0.5562, "mean_token_accuracy": 0.8883015066385269, "num_tokens": 380354451.0, "step": 3557 }, { "epoch": 8.104903078677308, "grad_norm": 4.125, "learning_rate": 4.763510246978548e-07, "loss": 0.5559, "mean_token_accuracy": 0.8845552057027817, "num_tokens": 380462073.0, "step": 3558 }, { "epoch": 8.107183580387685, "grad_norm": 4.71875, "learning_rate": 4.7524568391716736e-07, "loss": 0.5809, "mean_token_accuracy": 0.8840623497962952, "num_tokens": 380569477.0, "step": 3559 }, { "epoch": 8.10946408209806, "grad_norm": 3.21875, "learning_rate": 4.7414149234794064e-07, "loss": 0.5831, "mean_token_accuracy": 0.8819562494754791, "num_tokens": 380676862.0, "step": 3560 }, { "epoch": 8.111744583808438, "grad_norm": 3.8125, "learning_rate": 4.7303845061689197e-07, "loss": 0.5771, "mean_token_accuracy": 0.8820130825042725, "num_tokens": 380783938.0, "step": 3561 }, { "epoch": 8.114025085518815, "grad_norm": 3.1875, "learning_rate": 4.719365593500866e-07, "loss": 0.5504, "mean_token_accuracy": 0.8882433474063873, "num_tokens": 380891146.0, "step": 3562 }, { "epoch": 8.11630558722919, "grad_norm": 3.296875, "learning_rate": 4.7083581917293784e-07, "loss": 0.5798, "mean_token_accuracy": 0.8783487379550934, "num_tokens": 380998117.0, "step": 3563 }, { "epoch": 8.118586088939567, "grad_norm": 2.59375, "learning_rate": 4.6973623071020267e-07, "loss": 0.5682, "mean_token_accuracy": 0.8855432868003845, "num_tokens": 381104807.0, "step": 3564 }, { "epoch": 8.120866590649943, "grad_norm": 2.75, "learning_rate": 4.686377945859874e-07, "loss": 0.5585, "mean_token_accuracy": 0.8837299197912216, "num_tokens": 381212129.0, "step": 3565 }, { "epoch": 8.12314709236032, "grad_norm": 4.375, "learning_rate": 4.6754051142374275e-07, "loss": 0.5605, "mean_token_accuracy": 0.8824782967567444, "num_tokens": 381319310.0, "step": 3566 }, { "epoch": 8.125427594070695, "grad_norm": 3.375, "learning_rate": 4.664443818462658e-07, "loss": 0.5741, "mean_token_accuracy": 0.8837754726409912, "num_tokens": 381426819.0, "step": 3567 }, { "epoch": 8.127708095781072, "grad_norm": 4.40625, "learning_rate": 4.653494064756983e-07, "loss": 0.5864, "mean_token_accuracy": 0.882844015955925, "num_tokens": 381533803.0, "step": 3568 }, { "epoch": 8.129988597491447, "grad_norm": 2.71875, "learning_rate": 4.6425558593352796e-07, "loss": 0.5675, "mean_token_accuracy": 0.8839477002620697, "num_tokens": 381640832.0, "step": 3569 }, { "epoch": 8.132269099201825, "grad_norm": 4.125, "learning_rate": 4.631629208405847e-07, "loss": 0.5459, "mean_token_accuracy": 0.8868978321552277, "num_tokens": 381748214.0, "step": 3570 }, { "epoch": 8.134549600912202, "grad_norm": 3.359375, "learning_rate": 4.620714118170452e-07, "loss": 0.5508, "mean_token_accuracy": 0.8868376761674881, "num_tokens": 381855821.0, "step": 3571 }, { "epoch": 8.136830102622577, "grad_norm": 3.09375, "learning_rate": 4.609810594824282e-07, "loss": 0.5761, "mean_token_accuracy": 0.883231908082962, "num_tokens": 381962902.0, "step": 3572 }, { "epoch": 8.139110604332954, "grad_norm": 2.796875, "learning_rate": 4.598918644555975e-07, "loss": 0.5536, "mean_token_accuracy": 0.8844647407531738, "num_tokens": 382070254.0, "step": 3573 }, { "epoch": 8.14139110604333, "grad_norm": 2.828125, "learning_rate": 4.58803827354759e-07, "loss": 0.5937, "mean_token_accuracy": 0.8791768401861191, "num_tokens": 382177004.0, "step": 3574 }, { "epoch": 8.143671607753706, "grad_norm": 3.625, "learning_rate": 4.5771694879746087e-07, "loss": 0.5942, "mean_token_accuracy": 0.874402716755867, "num_tokens": 382283679.0, "step": 3575 }, { "epoch": 8.145952109464082, "grad_norm": 4.4375, "learning_rate": 4.566312294005948e-07, "loss": 0.5765, "mean_token_accuracy": 0.8818236142396927, "num_tokens": 382390417.0, "step": 3576 }, { "epoch": 8.148232611174459, "grad_norm": 3.03125, "learning_rate": 4.5554666978039455e-07, "loss": 0.5774, "mean_token_accuracy": 0.8824535757303238, "num_tokens": 382497498.0, "step": 3577 }, { "epoch": 8.150513112884834, "grad_norm": 3.90625, "learning_rate": 4.544632705524343e-07, "loss": 0.5755, "mean_token_accuracy": 0.8810164928436279, "num_tokens": 382604536.0, "step": 3578 }, { "epoch": 8.152793614595211, "grad_norm": 3.328125, "learning_rate": 4.5338103233163175e-07, "loss": 0.5817, "mean_token_accuracy": 0.8840733021497726, "num_tokens": 382711132.0, "step": 3579 }, { "epoch": 8.155074116305586, "grad_norm": 2.375, "learning_rate": 4.522999557322433e-07, "loss": 0.574, "mean_token_accuracy": 0.8804437518119812, "num_tokens": 382817671.0, "step": 3580 }, { "epoch": 8.157354618015964, "grad_norm": 3.421875, "learning_rate": 4.512200413678672e-07, "loss": 0.562, "mean_token_accuracy": 0.8838673382997513, "num_tokens": 382924893.0, "step": 3581 }, { "epoch": 8.15963511972634, "grad_norm": 3.0, "learning_rate": 4.501412898514426e-07, "loss": 0.5934, "mean_token_accuracy": 0.8765487670898438, "num_tokens": 383031251.0, "step": 3582 }, { "epoch": 8.161915621436716, "grad_norm": 2.40625, "learning_rate": 4.490637017952479e-07, "loss": 0.5583, "mean_token_accuracy": 0.8857912421226501, "num_tokens": 383138617.0, "step": 3583 }, { "epoch": 8.164196123147093, "grad_norm": 3.421875, "learning_rate": 4.4798727781090096e-07, "loss": 0.5761, "mean_token_accuracy": 0.883015900850296, "num_tokens": 383245038.0, "step": 3584 }, { "epoch": 8.166476624857468, "grad_norm": 3.109375, "learning_rate": 4.4691201850936034e-07, "loss": 0.5796, "mean_token_accuracy": 0.8790012896060944, "num_tokens": 383351713.0, "step": 3585 }, { "epoch": 8.168757126567845, "grad_norm": 3.578125, "learning_rate": 4.458379245009209e-07, "loss": 0.5598, "mean_token_accuracy": 0.8837389200925827, "num_tokens": 383458654.0, "step": 3586 }, { "epoch": 8.17103762827822, "grad_norm": 3.3125, "learning_rate": 4.447649963952183e-07, "loss": 0.584, "mean_token_accuracy": 0.8778487145900726, "num_tokens": 383565763.0, "step": 3587 }, { "epoch": 8.173318129988598, "grad_norm": 2.65625, "learning_rate": 4.43693234801226e-07, "loss": 0.5907, "mean_token_accuracy": 0.8766829818487167, "num_tokens": 383672820.0, "step": 3588 }, { "epoch": 8.175598631698973, "grad_norm": 5.4375, "learning_rate": 4.4262264032725517e-07, "loss": 0.5938, "mean_token_accuracy": 0.8824921548366547, "num_tokens": 383780010.0, "step": 3589 }, { "epoch": 8.17787913340935, "grad_norm": 3.125, "learning_rate": 4.41553213580955e-07, "loss": 0.5849, "mean_token_accuracy": 0.8807243406772614, "num_tokens": 383886905.0, "step": 3590 }, { "epoch": 8.180159635119727, "grad_norm": 2.703125, "learning_rate": 4.404849551693102e-07, "loss": 0.5775, "mean_token_accuracy": 0.8819932341575623, "num_tokens": 383994254.0, "step": 3591 }, { "epoch": 8.182440136830103, "grad_norm": 2.59375, "learning_rate": 4.394178656986448e-07, "loss": 0.5586, "mean_token_accuracy": 0.881662055850029, "num_tokens": 384101925.0, "step": 3592 }, { "epoch": 8.18472063854048, "grad_norm": 3.046875, "learning_rate": 4.383519457746174e-07, "loss": 0.5645, "mean_token_accuracy": 0.8843112587928772, "num_tokens": 384209051.0, "step": 3593 }, { "epoch": 8.187001140250855, "grad_norm": 2.8125, "learning_rate": 4.3728719600222374e-07, "loss": 0.5566, "mean_token_accuracy": 0.8843548893928528, "num_tokens": 384316445.0, "step": 3594 }, { "epoch": 8.189281641961232, "grad_norm": 2.875, "learning_rate": 4.3622361698579586e-07, "loss": 0.5965, "mean_token_accuracy": 0.8787325769662857, "num_tokens": 384422950.0, "step": 3595 }, { "epoch": 8.191562143671607, "grad_norm": 2.921875, "learning_rate": 4.351612093290006e-07, "loss": 0.5696, "mean_token_accuracy": 0.8826261758804321, "num_tokens": 384530108.0, "step": 3596 }, { "epoch": 8.193842645381984, "grad_norm": 3.171875, "learning_rate": 4.340999736348389e-07, "loss": 0.5549, "mean_token_accuracy": 0.8850667327642441, "num_tokens": 384637297.0, "step": 3597 }, { "epoch": 8.19612314709236, "grad_norm": 3.15625, "learning_rate": 4.3303991050564877e-07, "loss": 0.5562, "mean_token_accuracy": 0.8847819566726685, "num_tokens": 384744448.0, "step": 3598 }, { "epoch": 8.198403648802737, "grad_norm": 2.875, "learning_rate": 4.3198102054310157e-07, "loss": 0.5707, "mean_token_accuracy": 0.8816685527563095, "num_tokens": 384851534.0, "step": 3599 }, { "epoch": 8.200684150513112, "grad_norm": 4.25, "learning_rate": 4.30923304348202e-07, "loss": 0.5673, "mean_token_accuracy": 0.8833976686000824, "num_tokens": 384958686.0, "step": 3600 }, { "epoch": 8.20296465222349, "grad_norm": 2.90625, "learning_rate": 4.2986676252129047e-07, "loss": 0.58, "mean_token_accuracy": 0.8791931867599487, "num_tokens": 385065535.0, "step": 3601 }, { "epoch": 8.205245153933866, "grad_norm": 3.1875, "learning_rate": 4.288113956620382e-07, "loss": 0.5639, "mean_token_accuracy": 0.8821399062871933, "num_tokens": 385172528.0, "step": 3602 }, { "epoch": 8.207525655644242, "grad_norm": 2.828125, "learning_rate": 4.2775720436945225e-07, "loss": 0.5577, "mean_token_accuracy": 0.8869880139827728, "num_tokens": 385279822.0, "step": 3603 }, { "epoch": 8.209806157354619, "grad_norm": 2.984375, "learning_rate": 4.267041892418705e-07, "loss": 0.5724, "mean_token_accuracy": 0.8820626139640808, "num_tokens": 385387057.0, "step": 3604 }, { "epoch": 8.212086659064994, "grad_norm": 2.703125, "learning_rate": 4.256523508769647e-07, "loss": 0.5611, "mean_token_accuracy": 0.8827876895666122, "num_tokens": 385494227.0, "step": 3605 }, { "epoch": 8.214367160775371, "grad_norm": 3.28125, "learning_rate": 4.246016898717381e-07, "loss": 0.5853, "mean_token_accuracy": 0.8795175403356552, "num_tokens": 385600675.0, "step": 3606 }, { "epoch": 8.216647662485746, "grad_norm": 3.203125, "learning_rate": 4.235522068225248e-07, "loss": 0.5767, "mean_token_accuracy": 0.8818391412496567, "num_tokens": 385707591.0, "step": 3607 }, { "epoch": 8.218928164196123, "grad_norm": 3.015625, "learning_rate": 4.225039023249916e-07, "loss": 0.5563, "mean_token_accuracy": 0.8863593190908432, "num_tokens": 385814554.0, "step": 3608 }, { "epoch": 8.221208665906499, "grad_norm": 3.0, "learning_rate": 4.2145677697413566e-07, "loss": 0.5972, "mean_token_accuracy": 0.8759454041719437, "num_tokens": 385921328.0, "step": 3609 }, { "epoch": 8.223489167616876, "grad_norm": 4.3125, "learning_rate": 4.204108313642852e-07, "loss": 0.5589, "mean_token_accuracy": 0.8848637193441391, "num_tokens": 386028022.0, "step": 3610 }, { "epoch": 8.225769669327253, "grad_norm": 3.890625, "learning_rate": 4.1936606608909887e-07, "loss": 0.5767, "mean_token_accuracy": 0.883062332868576, "num_tokens": 386135191.0, "step": 3611 }, { "epoch": 8.228050171037628, "grad_norm": 3.078125, "learning_rate": 4.1832248174156597e-07, "loss": 0.5584, "mean_token_accuracy": 0.885178342461586, "num_tokens": 386242119.0, "step": 3612 }, { "epoch": 8.230330672748005, "grad_norm": 3.234375, "learning_rate": 4.1728007891400356e-07, "loss": 0.5714, "mean_token_accuracy": 0.8812027722597122, "num_tokens": 386349161.0, "step": 3613 }, { "epoch": 8.23261117445838, "grad_norm": 3.328125, "learning_rate": 4.1623885819805977e-07, "loss": 0.5699, "mean_token_accuracy": 0.8800331354141235, "num_tokens": 386456594.0, "step": 3614 }, { "epoch": 8.234891676168758, "grad_norm": 3.515625, "learning_rate": 4.151988201847112e-07, "loss": 0.5672, "mean_token_accuracy": 0.884452149271965, "num_tokens": 386563773.0, "step": 3615 }, { "epoch": 8.237172177879133, "grad_norm": 3.359375, "learning_rate": 4.141599654642642e-07, "loss": 0.5577, "mean_token_accuracy": 0.8853688091039658, "num_tokens": 386670901.0, "step": 3616 }, { "epoch": 8.23945267958951, "grad_norm": 2.78125, "learning_rate": 4.1312229462635243e-07, "loss": 0.5623, "mean_token_accuracy": 0.8857151865959167, "num_tokens": 386777914.0, "step": 3617 }, { "epoch": 8.241733181299885, "grad_norm": 3.296875, "learning_rate": 4.1208580825993686e-07, "loss": 0.5689, "mean_token_accuracy": 0.8840577751398087, "num_tokens": 386884846.0, "step": 3618 }, { "epoch": 8.244013683010262, "grad_norm": 3.15625, "learning_rate": 4.1105050695330774e-07, "loss": 0.5654, "mean_token_accuracy": 0.8825150728225708, "num_tokens": 386992013.0, "step": 3619 }, { "epoch": 8.246294184720638, "grad_norm": 3.953125, "learning_rate": 4.100163912940827e-07, "loss": 0.5946, "mean_token_accuracy": 0.8755682855844498, "num_tokens": 387098665.0, "step": 3620 }, { "epoch": 8.248574686431015, "grad_norm": 2.515625, "learning_rate": 4.0898346186920484e-07, "loss": 0.5697, "mean_token_accuracy": 0.8821796476840973, "num_tokens": 387205278.0, "step": 3621 }, { "epoch": 8.250855188141392, "grad_norm": 5.0625, "learning_rate": 4.0795171926494543e-07, "loss": 0.5953, "mean_token_accuracy": 0.8782723695039749, "num_tokens": 387312537.0, "step": 3622 }, { "epoch": 8.253135689851767, "grad_norm": 3.25, "learning_rate": 4.0692116406690214e-07, "loss": 0.5792, "mean_token_accuracy": 0.8817024827003479, "num_tokens": 387419037.0, "step": 3623 }, { "epoch": 8.255416191562144, "grad_norm": 4.84375, "learning_rate": 4.058917968599968e-07, "loss": 0.5844, "mean_token_accuracy": 0.8763107359409332, "num_tokens": 387525971.0, "step": 3624 }, { "epoch": 8.25769669327252, "grad_norm": 3.6875, "learning_rate": 4.048636182284796e-07, "loss": 0.5607, "mean_token_accuracy": 0.8834527432918549, "num_tokens": 387633084.0, "step": 3625 }, { "epoch": 8.259977194982897, "grad_norm": 2.84375, "learning_rate": 4.038366287559245e-07, "loss": 0.5754, "mean_token_accuracy": 0.8804467916488647, "num_tokens": 387740264.0, "step": 3626 }, { "epoch": 8.262257696693272, "grad_norm": 3.453125, "learning_rate": 4.0281082902523055e-07, "loss": 0.5722, "mean_token_accuracy": 0.8822118043899536, "num_tokens": 387847552.0, "step": 3627 }, { "epoch": 8.264538198403649, "grad_norm": 2.875, "learning_rate": 4.0178621961862315e-07, "loss": 0.572, "mean_token_accuracy": 0.8843962848186493, "num_tokens": 387954393.0, "step": 3628 }, { "epoch": 8.266818700114024, "grad_norm": 3.671875, "learning_rate": 4.0076280111764927e-07, "loss": 0.5857, "mean_token_accuracy": 0.8815756440162659, "num_tokens": 388061243.0, "step": 3629 }, { "epoch": 8.269099201824401, "grad_norm": 3.328125, "learning_rate": 3.997405741031821e-07, "loss": 0.5575, "mean_token_accuracy": 0.885826587677002, "num_tokens": 388168371.0, "step": 3630 }, { "epoch": 8.271379703534778, "grad_norm": 2.890625, "learning_rate": 3.98719539155418e-07, "loss": 0.5611, "mean_token_accuracy": 0.8817395716905594, "num_tokens": 388276038.0, "step": 3631 }, { "epoch": 8.273660205245154, "grad_norm": 2.890625, "learning_rate": 3.9769969685387684e-07, "loss": 0.5584, "mean_token_accuracy": 0.8838264048099518, "num_tokens": 388382766.0, "step": 3632 }, { "epoch": 8.27594070695553, "grad_norm": 3.625, "learning_rate": 3.966810477774016e-07, "loss": 0.5852, "mean_token_accuracy": 0.8818796724081039, "num_tokens": 388489939.0, "step": 3633 }, { "epoch": 8.278221208665906, "grad_norm": 3.84375, "learning_rate": 3.9566359250415686e-07, "loss": 0.5851, "mean_token_accuracy": 0.8771698474884033, "num_tokens": 388596579.0, "step": 3634 }, { "epoch": 8.280501710376283, "grad_norm": 3.078125, "learning_rate": 3.9464733161163144e-07, "loss": 0.5644, "mean_token_accuracy": 0.881603792309761, "num_tokens": 388703184.0, "step": 3635 }, { "epoch": 8.282782212086659, "grad_norm": 2.828125, "learning_rate": 3.9363226567663503e-07, "loss": 0.5835, "mean_token_accuracy": 0.8799589574337006, "num_tokens": 388809921.0, "step": 3636 }, { "epoch": 8.285062713797036, "grad_norm": 3.453125, "learning_rate": 3.926183952752999e-07, "loss": 0.5805, "mean_token_accuracy": 0.8831854462623596, "num_tokens": 388916867.0, "step": 3637 }, { "epoch": 8.287343215507411, "grad_norm": 3.03125, "learning_rate": 3.9160572098307923e-07, "loss": 0.5788, "mean_token_accuracy": 0.8826831877231598, "num_tokens": 389024041.0, "step": 3638 }, { "epoch": 8.289623717217788, "grad_norm": 2.859375, "learning_rate": 3.90594243374747e-07, "loss": 0.5683, "mean_token_accuracy": 0.8816553503274918, "num_tokens": 389131535.0, "step": 3639 }, { "epoch": 8.291904218928163, "grad_norm": 4.65625, "learning_rate": 3.895839630243983e-07, "loss": 0.5747, "mean_token_accuracy": 0.8834093660116196, "num_tokens": 389238175.0, "step": 3640 }, { "epoch": 8.29418472063854, "grad_norm": 3.390625, "learning_rate": 3.8857488050544903e-07, "loss": 0.5675, "mean_token_accuracy": 0.8817841410636902, "num_tokens": 389344785.0, "step": 3641 }, { "epoch": 8.296465222348917, "grad_norm": 2.625, "learning_rate": 3.875669963906356e-07, "loss": 0.5535, "mean_token_accuracy": 0.8869169652462006, "num_tokens": 389452290.0, "step": 3642 }, { "epoch": 8.298745724059293, "grad_norm": 2.578125, "learning_rate": 3.865603112520125e-07, "loss": 0.5573, "mean_token_accuracy": 0.8858313113451004, "num_tokens": 389559422.0, "step": 3643 }, { "epoch": 8.30102622576967, "grad_norm": 3.125, "learning_rate": 3.855548256609556e-07, "loss": 0.5437, "mean_token_accuracy": 0.8861667513847351, "num_tokens": 389667410.0, "step": 3644 }, { "epoch": 8.303306727480045, "grad_norm": 3.421875, "learning_rate": 3.8455054018815803e-07, "loss": 0.5598, "mean_token_accuracy": 0.8834993839263916, "num_tokens": 389774294.0, "step": 3645 }, { "epoch": 8.305587229190422, "grad_norm": 2.890625, "learning_rate": 3.8354745540363364e-07, "loss": 0.5594, "mean_token_accuracy": 0.8835310786962509, "num_tokens": 389881962.0, "step": 3646 }, { "epoch": 8.307867730900798, "grad_norm": 3.390625, "learning_rate": 3.8254557187671374e-07, "loss": 0.5657, "mean_token_accuracy": 0.881848931312561, "num_tokens": 389989039.0, "step": 3647 }, { "epoch": 8.310148232611175, "grad_norm": 3.0, "learning_rate": 3.815448901760485e-07, "loss": 0.5743, "mean_token_accuracy": 0.8844071328639984, "num_tokens": 390096310.0, "step": 3648 }, { "epoch": 8.31242873432155, "grad_norm": 3.4375, "learning_rate": 3.805454108696055e-07, "loss": 0.591, "mean_token_accuracy": 0.8806750923395157, "num_tokens": 390202884.0, "step": 3649 }, { "epoch": 8.314709236031927, "grad_norm": 2.578125, "learning_rate": 3.7954713452466927e-07, "loss": 0.5594, "mean_token_accuracy": 0.8855821341276169, "num_tokens": 390310246.0, "step": 3650 }, { "epoch": 8.316989737742304, "grad_norm": 3.578125, "learning_rate": 3.785500617078425e-07, "loss": 0.5854, "mean_token_accuracy": 0.8808927685022354, "num_tokens": 390416919.0, "step": 3651 }, { "epoch": 8.31927023945268, "grad_norm": 3.34375, "learning_rate": 3.775541929850443e-07, "loss": 0.5811, "mean_token_accuracy": 0.8799949437379837, "num_tokens": 390524810.0, "step": 3652 }, { "epoch": 8.321550741163056, "grad_norm": 2.390625, "learning_rate": 3.76559528921511e-07, "loss": 0.5762, "mean_token_accuracy": 0.8805143088102341, "num_tokens": 390631927.0, "step": 3653 }, { "epoch": 8.323831242873432, "grad_norm": 2.796875, "learning_rate": 3.7556607008179454e-07, "loss": 0.557, "mean_token_accuracy": 0.885184645652771, "num_tokens": 390738890.0, "step": 3654 }, { "epoch": 8.326111744583809, "grad_norm": 4.34375, "learning_rate": 3.745738170297633e-07, "loss": 0.5676, "mean_token_accuracy": 0.8800319731235504, "num_tokens": 390845227.0, "step": 3655 }, { "epoch": 8.328392246294184, "grad_norm": 4.28125, "learning_rate": 3.7358277032860016e-07, "loss": 0.5723, "mean_token_accuracy": 0.8837781846523285, "num_tokens": 390951990.0, "step": 3656 }, { "epoch": 8.330672748004561, "grad_norm": 3.046875, "learning_rate": 3.7259293054080435e-07, "loss": 0.594, "mean_token_accuracy": 0.8773994743824005, "num_tokens": 391059266.0, "step": 3657 }, { "epoch": 8.332953249714937, "grad_norm": 2.640625, "learning_rate": 3.7160429822819003e-07, "loss": 0.5713, "mean_token_accuracy": 0.8825381994247437, "num_tokens": 391166339.0, "step": 3658 }, { "epoch": 8.335233751425314, "grad_norm": 3.21875, "learning_rate": 3.706168739518859e-07, "loss": 0.5641, "mean_token_accuracy": 0.881363645195961, "num_tokens": 391273827.0, "step": 3659 }, { "epoch": 8.33751425313569, "grad_norm": 4.375, "learning_rate": 3.6963065827233524e-07, "loss": 0.5683, "mean_token_accuracy": 0.8808765411376953, "num_tokens": 391381343.0, "step": 3660 }, { "epoch": 8.339794754846066, "grad_norm": 4.28125, "learning_rate": 3.6864565174929393e-07, "loss": 0.5718, "mean_token_accuracy": 0.8825783133506775, "num_tokens": 391488599.0, "step": 3661 }, { "epoch": 8.342075256556443, "grad_norm": 2.9375, "learning_rate": 3.676618549418334e-07, "loss": 0.5825, "mean_token_accuracy": 0.8835724592208862, "num_tokens": 391596245.0, "step": 3662 }, { "epoch": 8.344355758266818, "grad_norm": 3.421875, "learning_rate": 3.666792684083381e-07, "loss": 0.5779, "mean_token_accuracy": 0.8827132731676102, "num_tokens": 391702960.0, "step": 3663 }, { "epoch": 8.346636259977195, "grad_norm": 2.96875, "learning_rate": 3.656978927065041e-07, "loss": 0.5646, "mean_token_accuracy": 0.8864104300737381, "num_tokens": 391811197.0, "step": 3664 }, { "epoch": 8.34891676168757, "grad_norm": 3.03125, "learning_rate": 3.64717728393342e-07, "loss": 0.602, "mean_token_accuracy": 0.8749131262302399, "num_tokens": 391918416.0, "step": 3665 }, { "epoch": 8.351197263397948, "grad_norm": 2.875, "learning_rate": 3.6373877602517457e-07, "loss": 0.5567, "mean_token_accuracy": 0.884322926402092, "num_tokens": 392025371.0, "step": 3666 }, { "epoch": 8.353477765108323, "grad_norm": 2.96875, "learning_rate": 3.627610361576353e-07, "loss": 0.5625, "mean_token_accuracy": 0.8826847523450851, "num_tokens": 392132210.0, "step": 3667 }, { "epoch": 8.3557582668187, "grad_norm": 3.0, "learning_rate": 3.6178450934567065e-07, "loss": 0.5909, "mean_token_accuracy": 0.8771587163209915, "num_tokens": 392239261.0, "step": 3668 }, { "epoch": 8.358038768529076, "grad_norm": 3.15625, "learning_rate": 3.6080919614353895e-07, "loss": 0.5765, "mean_token_accuracy": 0.8804507404565811, "num_tokens": 392345489.0, "step": 3669 }, { "epoch": 8.360319270239453, "grad_norm": 3.953125, "learning_rate": 3.598350971048087e-07, "loss": 0.5647, "mean_token_accuracy": 0.8826466053724289, "num_tokens": 392452753.0, "step": 3670 }, { "epoch": 8.36259977194983, "grad_norm": 2.796875, "learning_rate": 3.5886221278236045e-07, "loss": 0.5594, "mean_token_accuracy": 0.8848947435617447, "num_tokens": 392559803.0, "step": 3671 }, { "epoch": 8.364880273660205, "grad_norm": 3.859375, "learning_rate": 3.578905437283833e-07, "loss": 0.5829, "mean_token_accuracy": 0.8808190226554871, "num_tokens": 392666034.0, "step": 3672 }, { "epoch": 8.367160775370582, "grad_norm": 4.21875, "learning_rate": 3.569200904943784e-07, "loss": 0.5804, "mean_token_accuracy": 0.8789727836847305, "num_tokens": 392772892.0, "step": 3673 }, { "epoch": 8.369441277080957, "grad_norm": 3.765625, "learning_rate": 3.559508536311568e-07, "loss": 0.5528, "mean_token_accuracy": 0.8866614103317261, "num_tokens": 392880120.0, "step": 3674 }, { "epoch": 8.371721778791335, "grad_norm": 2.765625, "learning_rate": 3.549828336888378e-07, "loss": 0.5579, "mean_token_accuracy": 0.8827106803655624, "num_tokens": 392987873.0, "step": 3675 }, { "epoch": 8.37400228050171, "grad_norm": 2.71875, "learning_rate": 3.5401603121685197e-07, "loss": 0.5466, "mean_token_accuracy": 0.8857832849025726, "num_tokens": 393095077.0, "step": 3676 }, { "epoch": 8.376282782212087, "grad_norm": 4.3125, "learning_rate": 3.5305044676393645e-07, "loss": 0.5893, "mean_token_accuracy": 0.8792509883642197, "num_tokens": 393202410.0, "step": 3677 }, { "epoch": 8.378563283922462, "grad_norm": 4.28125, "learning_rate": 3.5208608087813874e-07, "loss": 0.5906, "mean_token_accuracy": 0.8767274022102356, "num_tokens": 393309342.0, "step": 3678 }, { "epoch": 8.38084378563284, "grad_norm": 2.609375, "learning_rate": 3.5112293410681455e-07, "loss": 0.5644, "mean_token_accuracy": 0.881255492568016, "num_tokens": 393416921.0, "step": 3679 }, { "epoch": 8.383124287343216, "grad_norm": 3.859375, "learning_rate": 3.501610069966271e-07, "loss": 0.5514, "mean_token_accuracy": 0.8843955397605896, "num_tokens": 393524843.0, "step": 3680 }, { "epoch": 8.385404789053592, "grad_norm": 2.84375, "learning_rate": 3.492003000935487e-07, "loss": 0.5547, "mean_token_accuracy": 0.8885585069656372, "num_tokens": 393632101.0, "step": 3681 }, { "epoch": 8.387685290763969, "grad_norm": 3.671875, "learning_rate": 3.482408139428564e-07, "loss": 0.571, "mean_token_accuracy": 0.8814975172281265, "num_tokens": 393739142.0, "step": 3682 }, { "epoch": 8.389965792474344, "grad_norm": 3.0, "learning_rate": 3.4728254908913683e-07, "loss": 0.5596, "mean_token_accuracy": 0.8834152221679688, "num_tokens": 393846302.0, "step": 3683 }, { "epoch": 8.392246294184721, "grad_norm": 2.90625, "learning_rate": 3.463255060762827e-07, "loss": 0.5646, "mean_token_accuracy": 0.8830453753471375, "num_tokens": 393952665.0, "step": 3684 }, { "epoch": 8.394526795895096, "grad_norm": 2.6875, "learning_rate": 3.4536968544749333e-07, "loss": 0.5683, "mean_token_accuracy": 0.8798201084136963, "num_tokens": 394059602.0, "step": 3685 }, { "epoch": 8.396807297605474, "grad_norm": 2.890625, "learning_rate": 3.4441508774527345e-07, "loss": 0.5612, "mean_token_accuracy": 0.8854030817747116, "num_tokens": 394166384.0, "step": 3686 }, { "epoch": 8.399087799315849, "grad_norm": 3.875, "learning_rate": 3.434617135114349e-07, "loss": 0.5723, "mean_token_accuracy": 0.8825076222419739, "num_tokens": 394273389.0, "step": 3687 }, { "epoch": 8.401368301026226, "grad_norm": 3.09375, "learning_rate": 3.425095632870937e-07, "loss": 0.594, "mean_token_accuracy": 0.8796204030513763, "num_tokens": 394380226.0, "step": 3688 }, { "epoch": 8.403648802736601, "grad_norm": 3.734375, "learning_rate": 3.4155863761267256e-07, "loss": 0.5841, "mean_token_accuracy": 0.8805309385061264, "num_tokens": 394486821.0, "step": 3689 }, { "epoch": 8.405929304446978, "grad_norm": 3.203125, "learning_rate": 3.406089370278981e-07, "loss": 0.572, "mean_token_accuracy": 0.8786486685276031, "num_tokens": 394593768.0, "step": 3690 }, { "epoch": 8.408209806157355, "grad_norm": 2.96875, "learning_rate": 3.396604620718025e-07, "loss": 0.5687, "mean_token_accuracy": 0.8826608210802078, "num_tokens": 394700911.0, "step": 3691 }, { "epoch": 8.41049030786773, "grad_norm": 3.109375, "learning_rate": 3.387132132827223e-07, "loss": 0.5861, "mean_token_accuracy": 0.8810716569423676, "num_tokens": 394807801.0, "step": 3692 }, { "epoch": 8.412770809578108, "grad_norm": 2.703125, "learning_rate": 3.377671911982963e-07, "loss": 0.5807, "mean_token_accuracy": 0.8786879032850266, "num_tokens": 394914319.0, "step": 3693 }, { "epoch": 8.415051311288483, "grad_norm": 3.046875, "learning_rate": 3.3682239635546927e-07, "loss": 0.5786, "mean_token_accuracy": 0.8792416155338287, "num_tokens": 395021338.0, "step": 3694 }, { "epoch": 8.41733181299886, "grad_norm": 2.671875, "learning_rate": 3.35878829290488e-07, "loss": 0.5768, "mean_token_accuracy": 0.8806908279657364, "num_tokens": 395128126.0, "step": 3695 }, { "epoch": 8.419612314709235, "grad_norm": 4.625, "learning_rate": 3.3493649053890325e-07, "loss": 0.5661, "mean_token_accuracy": 0.8870299756526947, "num_tokens": 395234907.0, "step": 3696 }, { "epoch": 8.421892816419613, "grad_norm": 2.953125, "learning_rate": 3.339953806355692e-07, "loss": 0.5706, "mean_token_accuracy": 0.8846122622489929, "num_tokens": 395342739.0, "step": 3697 }, { "epoch": 8.424173318129988, "grad_norm": 5.5625, "learning_rate": 3.330555001146399e-07, "loss": 0.5829, "mean_token_accuracy": 0.8779343664646149, "num_tokens": 395449718.0, "step": 3698 }, { "epoch": 8.426453819840365, "grad_norm": 2.796875, "learning_rate": 3.3211684950957416e-07, "loss": 0.5808, "mean_token_accuracy": 0.8803275525569916, "num_tokens": 395556795.0, "step": 3699 }, { "epoch": 8.428734321550742, "grad_norm": 3.0, "learning_rate": 3.311794293531323e-07, "loss": 0.5608, "mean_token_accuracy": 0.8883070945739746, "num_tokens": 395664009.0, "step": 3700 }, { "epoch": 8.431014823261117, "grad_norm": 4.125, "learning_rate": 3.3024324017737555e-07, "loss": 0.5747, "mean_token_accuracy": 0.8797996789216995, "num_tokens": 395771593.0, "step": 3701 }, { "epoch": 8.433295324971494, "grad_norm": 3.0625, "learning_rate": 3.2930828251366703e-07, "loss": 0.5726, "mean_token_accuracy": 0.8806504160165787, "num_tokens": 395878663.0, "step": 3702 }, { "epoch": 8.43557582668187, "grad_norm": 3.609375, "learning_rate": 3.283745568926708e-07, "loss": 0.5562, "mean_token_accuracy": 0.8880557268857956, "num_tokens": 395986031.0, "step": 3703 }, { "epoch": 8.437856328392247, "grad_norm": 3.09375, "learning_rate": 3.274420638443507e-07, "loss": 0.5772, "mean_token_accuracy": 0.8808362036943436, "num_tokens": 396093478.0, "step": 3704 }, { "epoch": 8.440136830102622, "grad_norm": 3.6875, "learning_rate": 3.2651080389797253e-07, "loss": 0.5438, "mean_token_accuracy": 0.8885146230459213, "num_tokens": 396200577.0, "step": 3705 }, { "epoch": 8.442417331813, "grad_norm": 2.640625, "learning_rate": 3.255807775821015e-07, "loss": 0.5767, "mean_token_accuracy": 0.8813702762126923, "num_tokens": 396307631.0, "step": 3706 }, { "epoch": 8.444697833523374, "grad_norm": 4.8125, "learning_rate": 3.246519854246022e-07, "loss": 0.5639, "mean_token_accuracy": 0.882218137383461, "num_tokens": 396415007.0, "step": 3707 }, { "epoch": 8.446978335233752, "grad_norm": 4.34375, "learning_rate": 3.2372442795263885e-07, "loss": 0.5712, "mean_token_accuracy": 0.8808966130018234, "num_tokens": 396521935.0, "step": 3708 }, { "epoch": 8.449258836944129, "grad_norm": 3.75, "learning_rate": 3.227981056926763e-07, "loss": 0.597, "mean_token_accuracy": 0.8786474019289017, "num_tokens": 396629351.0, "step": 3709 }, { "epoch": 8.451539338654504, "grad_norm": 3.25, "learning_rate": 3.218730191704758e-07, "loss": 0.5879, "mean_token_accuracy": 0.8776755332946777, "num_tokens": 396736310.0, "step": 3710 }, { "epoch": 8.453819840364881, "grad_norm": 3.5625, "learning_rate": 3.209491689110994e-07, "loss": 0.5609, "mean_token_accuracy": 0.8844281584024429, "num_tokens": 396843391.0, "step": 3711 }, { "epoch": 8.456100342075256, "grad_norm": 3.21875, "learning_rate": 3.2002655543890646e-07, "loss": 0.5692, "mean_token_accuracy": 0.8826111257076263, "num_tokens": 396950647.0, "step": 3712 }, { "epoch": 8.458380843785633, "grad_norm": 4.3125, "learning_rate": 3.1910517927755516e-07, "loss": 0.5732, "mean_token_accuracy": 0.879383772611618, "num_tokens": 397058121.0, "step": 3713 }, { "epoch": 8.460661345496009, "grad_norm": 2.640625, "learning_rate": 3.181850409499995e-07, "loss": 0.5669, "mean_token_accuracy": 0.8790426999330521, "num_tokens": 397165268.0, "step": 3714 }, { "epoch": 8.462941847206386, "grad_norm": 2.453125, "learning_rate": 3.1726614097849326e-07, "loss": 0.5596, "mean_token_accuracy": 0.8887834399938583, "num_tokens": 397272308.0, "step": 3715 }, { "epoch": 8.465222348916761, "grad_norm": 3.6875, "learning_rate": 3.163484798845862e-07, "loss": 0.5725, "mean_token_accuracy": 0.8812167048454285, "num_tokens": 397379017.0, "step": 3716 }, { "epoch": 8.467502850627138, "grad_norm": 3.703125, "learning_rate": 3.1543205818912484e-07, "loss": 0.5643, "mean_token_accuracy": 0.8864733725786209, "num_tokens": 397485914.0, "step": 3717 }, { "epoch": 8.469783352337513, "grad_norm": 2.8125, "learning_rate": 3.145168764122525e-07, "loss": 0.5769, "mean_token_accuracy": 0.8842067122459412, "num_tokens": 397592552.0, "step": 3718 }, { "epoch": 8.47206385404789, "grad_norm": 4.65625, "learning_rate": 3.1360293507340934e-07, "loss": 0.5665, "mean_token_accuracy": 0.8827557861804962, "num_tokens": 397699338.0, "step": 3719 }, { "epoch": 8.474344355758268, "grad_norm": 2.703125, "learning_rate": 3.1269023469132937e-07, "loss": 0.5658, "mean_token_accuracy": 0.8828287869691849, "num_tokens": 397806861.0, "step": 3720 }, { "epoch": 8.476624857468643, "grad_norm": 3.9375, "learning_rate": 3.117787757840449e-07, "loss": 0.5826, "mean_token_accuracy": 0.8807165771722794, "num_tokens": 397913908.0, "step": 3721 }, { "epoch": 8.47890535917902, "grad_norm": 2.921875, "learning_rate": 3.10868558868882e-07, "loss": 0.5543, "mean_token_accuracy": 0.8827250152826309, "num_tokens": 398021549.0, "step": 3722 }, { "epoch": 8.481185860889395, "grad_norm": 3.046875, "learning_rate": 3.0995958446246197e-07, "loss": 0.5783, "mean_token_accuracy": 0.881324291229248, "num_tokens": 398128775.0, "step": 3723 }, { "epoch": 8.483466362599772, "grad_norm": 3.21875, "learning_rate": 3.090518530807021e-07, "loss": 0.5687, "mean_token_accuracy": 0.8850361853837967, "num_tokens": 398235464.0, "step": 3724 }, { "epoch": 8.485746864310148, "grad_norm": 2.59375, "learning_rate": 3.0814536523881224e-07, "loss": 0.5474, "mean_token_accuracy": 0.8856307417154312, "num_tokens": 398342478.0, "step": 3725 }, { "epoch": 8.488027366020525, "grad_norm": 3.703125, "learning_rate": 3.072401214512974e-07, "loss": 0.5838, "mean_token_accuracy": 0.8800984472036362, "num_tokens": 398449103.0, "step": 3726 }, { "epoch": 8.4903078677309, "grad_norm": 3.0625, "learning_rate": 3.063361222319569e-07, "loss": 0.5692, "mean_token_accuracy": 0.8843390345573425, "num_tokens": 398556171.0, "step": 3727 }, { "epoch": 8.492588369441277, "grad_norm": 3.71875, "learning_rate": 3.054333680938837e-07, "loss": 0.5561, "mean_token_accuracy": 0.8837459683418274, "num_tokens": 398663128.0, "step": 3728 }, { "epoch": 8.494868871151652, "grad_norm": 4.25, "learning_rate": 3.045318595494623e-07, "loss": 0.5668, "mean_token_accuracy": 0.8831415474414825, "num_tokens": 398770220.0, "step": 3729 }, { "epoch": 8.49714937286203, "grad_norm": 2.796875, "learning_rate": 3.036315971103723e-07, "loss": 0.5735, "mean_token_accuracy": 0.8805525749921799, "num_tokens": 398876750.0, "step": 3730 }, { "epoch": 8.499429874572407, "grad_norm": 2.796875, "learning_rate": 3.0273258128758585e-07, "loss": 0.5894, "mean_token_accuracy": 0.8824233710765839, "num_tokens": 398983846.0, "step": 3731 }, { "epoch": 8.501710376282782, "grad_norm": 2.671875, "learning_rate": 3.018348125913659e-07, "loss": 0.5686, "mean_token_accuracy": 0.8839138001203537, "num_tokens": 399090705.0, "step": 3732 }, { "epoch": 8.503990877993159, "grad_norm": 3.015625, "learning_rate": 3.009382915312689e-07, "loss": 0.5718, "mean_token_accuracy": 0.8818164467811584, "num_tokens": 399197937.0, "step": 3733 }, { "epoch": 8.506271379703534, "grad_norm": 4.03125, "learning_rate": 3.000430186161432e-07, "loss": 0.5653, "mean_token_accuracy": 0.8860601335763931, "num_tokens": 399304810.0, "step": 3734 }, { "epoch": 8.508551881413911, "grad_norm": 3.265625, "learning_rate": 2.991489943541287e-07, "loss": 0.5684, "mean_token_accuracy": 0.8790812194347382, "num_tokens": 399411951.0, "step": 3735 }, { "epoch": 8.510832383124287, "grad_norm": 2.96875, "learning_rate": 2.982562192526556e-07, "loss": 0.5784, "mean_token_accuracy": 0.881022498011589, "num_tokens": 399519175.0, "step": 3736 }, { "epoch": 8.513112884834664, "grad_norm": 3.640625, "learning_rate": 2.97364693818446e-07, "loss": 0.5759, "mean_token_accuracy": 0.8782191127538681, "num_tokens": 399626295.0, "step": 3737 }, { "epoch": 8.515393386545039, "grad_norm": 2.90625, "learning_rate": 2.9647441855751274e-07, "loss": 0.5584, "mean_token_accuracy": 0.8843040764331818, "num_tokens": 399733595.0, "step": 3738 }, { "epoch": 8.517673888255416, "grad_norm": 4.59375, "learning_rate": 2.9558539397515905e-07, "loss": 0.5624, "mean_token_accuracy": 0.884150817990303, "num_tokens": 399840730.0, "step": 3739 }, { "epoch": 8.519954389965793, "grad_norm": 3.25, "learning_rate": 2.94697620575978e-07, "loss": 0.5754, "mean_token_accuracy": 0.8824001997709274, "num_tokens": 399947355.0, "step": 3740 }, { "epoch": 8.519954389965793, "eval_loss": 0.586450457572937, "eval_mean_token_accuracy": 0.8799830790708274, "eval_num_tokens": 399947355.0, "eval_runtime": 58.61, "eval_samples_per_second": 143.064, "eval_steps_per_second": 4.487, "step": 3740 }, { "epoch": 8.522234891676169, "grad_norm": 3.0, "learning_rate": 2.938110988638521e-07, "loss": 0.5732, "mean_token_accuracy": 0.8857548832893372, "num_tokens": 400053792.0, "step": 3741 }, { "epoch": 8.524515393386546, "grad_norm": 2.625, "learning_rate": 2.9292582934195427e-07, "loss": 0.5805, "mean_token_accuracy": 0.8823639154434204, "num_tokens": 400160716.0, "step": 3742 }, { "epoch": 8.526795895096921, "grad_norm": 4.75, "learning_rate": 2.9204181251274665e-07, "loss": 0.5734, "mean_token_accuracy": 0.8819014728069305, "num_tokens": 400267286.0, "step": 3743 }, { "epoch": 8.529076396807298, "grad_norm": 3.234375, "learning_rate": 2.9115904887798005e-07, "loss": 0.5767, "mean_token_accuracy": 0.8814072459936142, "num_tokens": 400374052.0, "step": 3744 }, { "epoch": 8.531356898517673, "grad_norm": 2.84375, "learning_rate": 2.9027753893869387e-07, "loss": 0.5511, "mean_token_accuracy": 0.8873879760503769, "num_tokens": 400481323.0, "step": 3745 }, { "epoch": 8.53363740022805, "grad_norm": 3.890625, "learning_rate": 2.893972831952166e-07, "loss": 0.5714, "mean_token_accuracy": 0.8847419172525406, "num_tokens": 400588604.0, "step": 3746 }, { "epoch": 8.535917901938426, "grad_norm": 2.765625, "learning_rate": 2.8851828214716383e-07, "loss": 0.5778, "mean_token_accuracy": 0.8783596158027649, "num_tokens": 400695471.0, "step": 3747 }, { "epoch": 8.538198403648803, "grad_norm": 2.859375, "learning_rate": 2.876405362934395e-07, "loss": 0.5835, "mean_token_accuracy": 0.8776167929172516, "num_tokens": 400802607.0, "step": 3748 }, { "epoch": 8.54047890535918, "grad_norm": 2.96875, "learning_rate": 2.8676404613223573e-07, "loss": 0.5807, "mean_token_accuracy": 0.8812108039855957, "num_tokens": 400909340.0, "step": 3749 }, { "epoch": 8.542759407069555, "grad_norm": 4.0, "learning_rate": 2.858888121610315e-07, "loss": 0.5536, "mean_token_accuracy": 0.887622594833374, "num_tokens": 401015859.0, "step": 3750 }, { "epoch": 8.545039908779932, "grad_norm": 3.4375, "learning_rate": 2.8501483487659217e-07, "loss": 0.5961, "mean_token_accuracy": 0.8768871873617172, "num_tokens": 401122411.0, "step": 3751 }, { "epoch": 8.547320410490308, "grad_norm": 2.578125, "learning_rate": 2.841421147749709e-07, "loss": 0.5755, "mean_token_accuracy": 0.8786147236824036, "num_tokens": 401229660.0, "step": 3752 }, { "epoch": 8.549600912200685, "grad_norm": 2.65625, "learning_rate": 2.832706523515061e-07, "loss": 0.5717, "mean_token_accuracy": 0.8837973475456238, "num_tokens": 401336325.0, "step": 3753 }, { "epoch": 8.55188141391106, "grad_norm": 2.890625, "learning_rate": 2.824004481008233e-07, "loss": 0.5764, "mean_token_accuracy": 0.8784725517034531, "num_tokens": 401444403.0, "step": 3754 }, { "epoch": 8.554161915621437, "grad_norm": 3.15625, "learning_rate": 2.815315025168339e-07, "loss": 0.5673, "mean_token_accuracy": 0.880395233631134, "num_tokens": 401551536.0, "step": 3755 }, { "epoch": 8.556442417331812, "grad_norm": 2.578125, "learning_rate": 2.8066381609273497e-07, "loss": 0.5666, "mean_token_accuracy": 0.8854324221611023, "num_tokens": 401658365.0, "step": 3756 }, { "epoch": 8.55872291904219, "grad_norm": 3.09375, "learning_rate": 2.7979738932100734e-07, "loss": 0.5559, "mean_token_accuracy": 0.8865665048360825, "num_tokens": 401765495.0, "step": 3757 }, { "epoch": 8.561003420752566, "grad_norm": 3.578125, "learning_rate": 2.7893222269341906e-07, "loss": 0.5725, "mean_token_accuracy": 0.8840383142232895, "num_tokens": 401872546.0, "step": 3758 }, { "epoch": 8.563283922462942, "grad_norm": 3.171875, "learning_rate": 2.7806831670102176e-07, "loss": 0.5856, "mean_token_accuracy": 0.8813675791025162, "num_tokens": 401979397.0, "step": 3759 }, { "epoch": 8.565564424173319, "grad_norm": 2.875, "learning_rate": 2.7720567183415175e-07, "loss": 0.5786, "mean_token_accuracy": 0.8806767165660858, "num_tokens": 402086207.0, "step": 3760 }, { "epoch": 8.567844925883694, "grad_norm": 3.015625, "learning_rate": 2.7634428858242995e-07, "loss": 0.5909, "mean_token_accuracy": 0.8802158385515213, "num_tokens": 402192757.0, "step": 3761 }, { "epoch": 8.570125427594071, "grad_norm": 2.796875, "learning_rate": 2.754841674347608e-07, "loss": 0.5772, "mean_token_accuracy": 0.8806091845035553, "num_tokens": 402299546.0, "step": 3762 }, { "epoch": 8.572405929304447, "grad_norm": 3.46875, "learning_rate": 2.7462530887933216e-07, "loss": 0.559, "mean_token_accuracy": 0.8859734088182449, "num_tokens": 402406468.0, "step": 3763 }, { "epoch": 8.574686431014824, "grad_norm": 3.375, "learning_rate": 2.737677134036154e-07, "loss": 0.5665, "mean_token_accuracy": 0.8834273219108582, "num_tokens": 402513385.0, "step": 3764 }, { "epoch": 8.576966932725199, "grad_norm": 3.015625, "learning_rate": 2.729113814943654e-07, "loss": 0.5879, "mean_token_accuracy": 0.8773778825998306, "num_tokens": 402620374.0, "step": 3765 }, { "epoch": 8.579247434435576, "grad_norm": 3.25, "learning_rate": 2.7205631363761976e-07, "loss": 0.5701, "mean_token_accuracy": 0.8817746788263321, "num_tokens": 402727727.0, "step": 3766 }, { "epoch": 8.581527936145951, "grad_norm": 2.90625, "learning_rate": 2.7120251031869884e-07, "loss": 0.5487, "mean_token_accuracy": 0.886865645647049, "num_tokens": 402834747.0, "step": 3767 }, { "epoch": 8.583808437856328, "grad_norm": 3.0, "learning_rate": 2.7034997202220384e-07, "loss": 0.5794, "mean_token_accuracy": 0.8793386965990067, "num_tokens": 402941219.0, "step": 3768 }, { "epoch": 8.586088939566705, "grad_norm": 2.953125, "learning_rate": 2.6949869923202e-07, "loss": 0.563, "mean_token_accuracy": 0.8822144120931625, "num_tokens": 403048515.0, "step": 3769 }, { "epoch": 8.58836944127708, "grad_norm": 3.0, "learning_rate": 2.686486924313128e-07, "loss": 0.5786, "mean_token_accuracy": 0.8802978694438934, "num_tokens": 403156281.0, "step": 3770 }, { "epoch": 8.590649942987458, "grad_norm": 2.8125, "learning_rate": 2.6779995210253015e-07, "loss": 0.5822, "mean_token_accuracy": 0.8790141940116882, "num_tokens": 403263439.0, "step": 3771 }, { "epoch": 8.592930444697833, "grad_norm": 3.21875, "learning_rate": 2.6695247872740027e-07, "loss": 0.5862, "mean_token_accuracy": 0.8814330548048019, "num_tokens": 403370460.0, "step": 3772 }, { "epoch": 8.59521094640821, "grad_norm": 2.53125, "learning_rate": 2.6610627278693265e-07, "loss": 0.572, "mean_token_accuracy": 0.8820819407701492, "num_tokens": 403477203.0, "step": 3773 }, { "epoch": 8.597491448118586, "grad_norm": 2.65625, "learning_rate": 2.6526133476141804e-07, "loss": 0.5748, "mean_token_accuracy": 0.8831133842468262, "num_tokens": 403583814.0, "step": 3774 }, { "epoch": 8.599771949828963, "grad_norm": 3.765625, "learning_rate": 2.644176651304259e-07, "loss": 0.5623, "mean_token_accuracy": 0.8832801431417465, "num_tokens": 403690937.0, "step": 3775 }, { "epoch": 8.602052451539338, "grad_norm": 2.671875, "learning_rate": 2.6357526437280764e-07, "loss": 0.5697, "mean_token_accuracy": 0.8839538246393204, "num_tokens": 403798148.0, "step": 3776 }, { "epoch": 8.604332953249715, "grad_norm": 2.828125, "learning_rate": 2.6273413296669353e-07, "loss": 0.5672, "mean_token_accuracy": 0.8829332441091537, "num_tokens": 403904875.0, "step": 3777 }, { "epoch": 8.60661345496009, "grad_norm": 2.75, "learning_rate": 2.618942713894937e-07, "loss": 0.5744, "mean_token_accuracy": 0.8815491497516632, "num_tokens": 404011913.0, "step": 3778 }, { "epoch": 8.608893956670467, "grad_norm": 2.703125, "learning_rate": 2.610556801178968e-07, "loss": 0.5873, "mean_token_accuracy": 0.8805450052022934, "num_tokens": 404119324.0, "step": 3779 }, { "epoch": 8.611174458380844, "grad_norm": 2.578125, "learning_rate": 2.602183596278715e-07, "loss": 0.5592, "mean_token_accuracy": 0.889171689748764, "num_tokens": 404226267.0, "step": 3780 }, { "epoch": 8.61345496009122, "grad_norm": 2.8125, "learning_rate": 2.5938231039466436e-07, "loss": 0.5535, "mean_token_accuracy": 0.8866444677114487, "num_tokens": 404333323.0, "step": 3781 }, { "epoch": 8.615735461801597, "grad_norm": 2.734375, "learning_rate": 2.585475328928011e-07, "loss": 0.5725, "mean_token_accuracy": 0.8843964338302612, "num_tokens": 404440123.0, "step": 3782 }, { "epoch": 8.618015963511972, "grad_norm": 3.71875, "learning_rate": 2.577140275960857e-07, "loss": 0.5852, "mean_token_accuracy": 0.882148802280426, "num_tokens": 404546230.0, "step": 3783 }, { "epoch": 8.62029646522235, "grad_norm": 2.671875, "learning_rate": 2.5688179497759895e-07, "loss": 0.5542, "mean_token_accuracy": 0.8827191293239594, "num_tokens": 404653034.0, "step": 3784 }, { "epoch": 8.622576966932725, "grad_norm": 3.140625, "learning_rate": 2.560508355097002e-07, "loss": 0.5588, "mean_token_accuracy": 0.8882575631141663, "num_tokens": 404760581.0, "step": 3785 }, { "epoch": 8.624857468643102, "grad_norm": 3.21875, "learning_rate": 2.552211496640261e-07, "loss": 0.5561, "mean_token_accuracy": 0.8854646235704422, "num_tokens": 404868271.0, "step": 3786 }, { "epoch": 8.627137970353477, "grad_norm": 2.375, "learning_rate": 2.543927379114902e-07, "loss": 0.5736, "mean_token_accuracy": 0.8811837881803513, "num_tokens": 404974915.0, "step": 3787 }, { "epoch": 8.629418472063854, "grad_norm": 2.875, "learning_rate": 2.5356560072228335e-07, "loss": 0.5688, "mean_token_accuracy": 0.8805698156356812, "num_tokens": 405081754.0, "step": 3788 }, { "epoch": 8.631698973774231, "grad_norm": 2.921875, "learning_rate": 2.5273973856587283e-07, "loss": 0.579, "mean_token_accuracy": 0.879248857498169, "num_tokens": 405188743.0, "step": 3789 }, { "epoch": 8.633979475484606, "grad_norm": 3.046875, "learning_rate": 2.5191515191100107e-07, "loss": 0.5769, "mean_token_accuracy": 0.8826890587806702, "num_tokens": 405296046.0, "step": 3790 }, { "epoch": 8.636259977194984, "grad_norm": 3.71875, "learning_rate": 2.5109184122568797e-07, "loss": 0.5745, "mean_token_accuracy": 0.88176129758358, "num_tokens": 405402833.0, "step": 3791 }, { "epoch": 8.638540478905359, "grad_norm": 2.796875, "learning_rate": 2.502698069772294e-07, "loss": 0.5776, "mean_token_accuracy": 0.8807597756385803, "num_tokens": 405509745.0, "step": 3792 }, { "epoch": 8.640820980615736, "grad_norm": 2.953125, "learning_rate": 2.494490496321958e-07, "loss": 0.5805, "mean_token_accuracy": 0.8785258382558823, "num_tokens": 405616827.0, "step": 3793 }, { "epoch": 8.643101482326111, "grad_norm": 2.375, "learning_rate": 2.4862956965643253e-07, "loss": 0.582, "mean_token_accuracy": 0.8794488459825516, "num_tokens": 405723490.0, "step": 3794 }, { "epoch": 8.645381984036488, "grad_norm": 3.0625, "learning_rate": 2.4781136751506176e-07, "loss": 0.5634, "mean_token_accuracy": 0.8818620294332504, "num_tokens": 405830225.0, "step": 3795 }, { "epoch": 8.647662485746864, "grad_norm": 3.171875, "learning_rate": 2.4699444367247834e-07, "loss": 0.5658, "mean_token_accuracy": 0.886525347828865, "num_tokens": 405937278.0, "step": 3796 }, { "epoch": 8.64994298745724, "grad_norm": 2.625, "learning_rate": 2.461787985923525e-07, "loss": 0.5698, "mean_token_accuracy": 0.8777887523174286, "num_tokens": 406044118.0, "step": 3797 }, { "epoch": 8.652223489167618, "grad_norm": 3.21875, "learning_rate": 2.4536443273762864e-07, "loss": 0.5695, "mean_token_accuracy": 0.8797616511583328, "num_tokens": 406151330.0, "step": 3798 }, { "epoch": 8.654503990877993, "grad_norm": 5.15625, "learning_rate": 2.4455134657052626e-07, "loss": 0.5616, "mean_token_accuracy": 0.8845881819725037, "num_tokens": 406258516.0, "step": 3799 }, { "epoch": 8.65678449258837, "grad_norm": 2.9375, "learning_rate": 2.437395405525356e-07, "loss": 0.5844, "mean_token_accuracy": 0.8821278810501099, "num_tokens": 406364734.0, "step": 3800 }, { "epoch": 8.659064994298745, "grad_norm": 4.46875, "learning_rate": 2.429290151444233e-07, "loss": 0.5531, "mean_token_accuracy": 0.8842455595731735, "num_tokens": 406471849.0, "step": 3801 }, { "epoch": 8.661345496009123, "grad_norm": 3.359375, "learning_rate": 2.421197708062273e-07, "loss": 0.5721, "mean_token_accuracy": 0.8807191848754883, "num_tokens": 406579175.0, "step": 3802 }, { "epoch": 8.663625997719498, "grad_norm": 4.59375, "learning_rate": 2.413118079972593e-07, "loss": 0.5752, "mean_token_accuracy": 0.8790168464183807, "num_tokens": 406686082.0, "step": 3803 }, { "epoch": 8.665906499429875, "grad_norm": 4.0, "learning_rate": 2.405051271761036e-07, "loss": 0.5646, "mean_token_accuracy": 0.8817109614610672, "num_tokens": 406793281.0, "step": 3804 }, { "epoch": 8.66818700114025, "grad_norm": 3.234375, "learning_rate": 2.396997288006167e-07, "loss": 0.5687, "mean_token_accuracy": 0.8831426650285721, "num_tokens": 406900397.0, "step": 3805 }, { "epoch": 8.670467502850627, "grad_norm": 2.984375, "learning_rate": 2.388956133279266e-07, "loss": 0.5637, "mean_token_accuracy": 0.885161817073822, "num_tokens": 407007825.0, "step": 3806 }, { "epoch": 8.672748004561003, "grad_norm": 2.875, "learning_rate": 2.3809278121443403e-07, "loss": 0.5742, "mean_token_accuracy": 0.8806300610303879, "num_tokens": 407115154.0, "step": 3807 }, { "epoch": 8.67502850627138, "grad_norm": 4.625, "learning_rate": 2.3729123291581112e-07, "loss": 0.5783, "mean_token_accuracy": 0.8829029500484467, "num_tokens": 407222499.0, "step": 3808 }, { "epoch": 8.677309007981757, "grad_norm": 2.96875, "learning_rate": 2.3649096888700095e-07, "loss": 0.5737, "mean_token_accuracy": 0.8817735910415649, "num_tokens": 407329789.0, "step": 3809 }, { "epoch": 8.679589509692132, "grad_norm": 2.421875, "learning_rate": 2.356919895822188e-07, "loss": 0.5606, "mean_token_accuracy": 0.885416716337204, "num_tokens": 407436906.0, "step": 3810 }, { "epoch": 8.68187001140251, "grad_norm": 2.859375, "learning_rate": 2.3489429545494851e-07, "loss": 0.5764, "mean_token_accuracy": 0.8833657205104828, "num_tokens": 407544131.0, "step": 3811 }, { "epoch": 8.684150513112884, "grad_norm": 2.625, "learning_rate": 2.3409788695794688e-07, "loss": 0.5696, "mean_token_accuracy": 0.8843205720186234, "num_tokens": 407652008.0, "step": 3812 }, { "epoch": 8.686431014823262, "grad_norm": 2.9375, "learning_rate": 2.3330276454323926e-07, "loss": 0.5669, "mean_token_accuracy": 0.8810292482376099, "num_tokens": 407759542.0, "step": 3813 }, { "epoch": 8.688711516533637, "grad_norm": 3.03125, "learning_rate": 2.3250892866212294e-07, "loss": 0.5823, "mean_token_accuracy": 0.8779249489307404, "num_tokens": 407865934.0, "step": 3814 }, { "epoch": 8.690992018244014, "grad_norm": 3.046875, "learning_rate": 2.3171637976516253e-07, "loss": 0.5513, "mean_token_accuracy": 0.8835948258638382, "num_tokens": 407973474.0, "step": 3815 }, { "epoch": 8.69327251995439, "grad_norm": 2.78125, "learning_rate": 2.3092511830219405e-07, "loss": 0.5723, "mean_token_accuracy": 0.8819569945335388, "num_tokens": 408080757.0, "step": 3816 }, { "epoch": 8.695553021664766, "grad_norm": 2.671875, "learning_rate": 2.3013514472232295e-07, "loss": 0.5788, "mean_token_accuracy": 0.8784191906452179, "num_tokens": 408188778.0, "step": 3817 }, { "epoch": 8.697833523375142, "grad_norm": 2.421875, "learning_rate": 2.293464594739214e-07, "loss": 0.5593, "mean_token_accuracy": 0.8858485221862793, "num_tokens": 408295942.0, "step": 3818 }, { "epoch": 8.700114025085519, "grad_norm": 2.921875, "learning_rate": 2.2855906300463305e-07, "loss": 0.5894, "mean_token_accuracy": 0.8777198940515518, "num_tokens": 408402995.0, "step": 3819 }, { "epoch": 8.702394526795896, "grad_norm": 3.109375, "learning_rate": 2.2777295576136865e-07, "loss": 0.5557, "mean_token_accuracy": 0.888022854924202, "num_tokens": 408510620.0, "step": 3820 }, { "epoch": 8.704675028506271, "grad_norm": 3.203125, "learning_rate": 2.2698813819030802e-07, "loss": 0.567, "mean_token_accuracy": 0.8824697285890579, "num_tokens": 408617724.0, "step": 3821 }, { "epoch": 8.706955530216648, "grad_norm": 3.484375, "learning_rate": 2.2620461073689732e-07, "loss": 0.5641, "mean_token_accuracy": 0.8824862539768219, "num_tokens": 408724694.0, "step": 3822 }, { "epoch": 8.709236031927023, "grad_norm": 3.640625, "learning_rate": 2.254223738458522e-07, "loss": 0.5593, "mean_token_accuracy": 0.8855371475219727, "num_tokens": 408832529.0, "step": 3823 }, { "epoch": 8.7115165336374, "grad_norm": 2.703125, "learning_rate": 2.2464142796115557e-07, "loss": 0.553, "mean_token_accuracy": 0.8828513324260712, "num_tokens": 408939774.0, "step": 3824 }, { "epoch": 8.713797035347776, "grad_norm": 3.125, "learning_rate": 2.2386177352605677e-07, "loss": 0.5758, "mean_token_accuracy": 0.8790740817785263, "num_tokens": 409046308.0, "step": 3825 }, { "epoch": 8.716077537058153, "grad_norm": 2.546875, "learning_rate": 2.2308341098307318e-07, "loss": 0.554, "mean_token_accuracy": 0.8837715536355972, "num_tokens": 409153352.0, "step": 3826 }, { "epoch": 8.718358038768528, "grad_norm": 3.0, "learning_rate": 2.2230634077398755e-07, "loss": 0.5774, "mean_token_accuracy": 0.8824053555727005, "num_tokens": 409260531.0, "step": 3827 }, { "epoch": 8.720638540478905, "grad_norm": 3.0, "learning_rate": 2.2153056333985014e-07, "loss": 0.5613, "mean_token_accuracy": 0.8850871324539185, "num_tokens": 409367568.0, "step": 3828 }, { "epoch": 8.722919042189282, "grad_norm": 3.96875, "learning_rate": 2.2075607912097758e-07, "loss": 0.566, "mean_token_accuracy": 0.8864067494869232, "num_tokens": 409474914.0, "step": 3829 }, { "epoch": 8.725199543899658, "grad_norm": 5.375, "learning_rate": 2.1998288855695189e-07, "loss": 0.5718, "mean_token_accuracy": 0.8827249854803085, "num_tokens": 409581981.0, "step": 3830 }, { "epoch": 8.727480045610035, "grad_norm": 3.46875, "learning_rate": 2.1921099208662173e-07, "loss": 0.5828, "mean_token_accuracy": 0.8793773353099823, "num_tokens": 409689221.0, "step": 3831 }, { "epoch": 8.72976054732041, "grad_norm": 4.1875, "learning_rate": 2.184403901480997e-07, "loss": 0.5921, "mean_token_accuracy": 0.8778216391801834, "num_tokens": 409796146.0, "step": 3832 }, { "epoch": 8.732041049030787, "grad_norm": 3.28125, "learning_rate": 2.176710831787651e-07, "loss": 0.565, "mean_token_accuracy": 0.8842185884714127, "num_tokens": 409902678.0, "step": 3833 }, { "epoch": 8.734321550741162, "grad_norm": 2.921875, "learning_rate": 2.1690307161526148e-07, "loss": 0.5482, "mean_token_accuracy": 0.8871366381645203, "num_tokens": 410010272.0, "step": 3834 }, { "epoch": 8.73660205245154, "grad_norm": 3.890625, "learning_rate": 2.1613635589349756e-07, "loss": 0.5492, "mean_token_accuracy": 0.885500431060791, "num_tokens": 410117832.0, "step": 3835 }, { "epoch": 8.738882554161915, "grad_norm": 2.84375, "learning_rate": 2.153709364486467e-07, "loss": 0.5601, "mean_token_accuracy": 0.8852385729551315, "num_tokens": 410225406.0, "step": 3836 }, { "epoch": 8.741163055872292, "grad_norm": 2.640625, "learning_rate": 2.1460681371514552e-07, "loss": 0.5574, "mean_token_accuracy": 0.8837675005197525, "num_tokens": 410332633.0, "step": 3837 }, { "epoch": 8.743443557582669, "grad_norm": 2.703125, "learning_rate": 2.13843988126696e-07, "loss": 0.5727, "mean_token_accuracy": 0.8787273913621902, "num_tokens": 410439603.0, "step": 3838 }, { "epoch": 8.745724059293044, "grad_norm": 3.421875, "learning_rate": 2.130824601162626e-07, "loss": 0.5633, "mean_token_accuracy": 0.8858155012130737, "num_tokens": 410546706.0, "step": 3839 }, { "epoch": 8.748004561003421, "grad_norm": 3.5625, "learning_rate": 2.1232223011607406e-07, "loss": 0.5916, "mean_token_accuracy": 0.8737548291683197, "num_tokens": 410653837.0, "step": 3840 }, { "epoch": 8.750285062713797, "grad_norm": 2.828125, "learning_rate": 2.1156329855762243e-07, "loss": 0.5517, "mean_token_accuracy": 0.8844601809978485, "num_tokens": 410761316.0, "step": 3841 }, { "epoch": 8.752565564424174, "grad_norm": 3.234375, "learning_rate": 2.1080566587166286e-07, "loss": 0.5618, "mean_token_accuracy": 0.8839665204286575, "num_tokens": 410868322.0, "step": 3842 }, { "epoch": 8.754846066134549, "grad_norm": 3.25, "learning_rate": 2.1004933248821247e-07, "loss": 0.5743, "mean_token_accuracy": 0.8804609328508377, "num_tokens": 410976264.0, "step": 3843 }, { "epoch": 8.757126567844926, "grad_norm": 3.578125, "learning_rate": 2.0929429883655151e-07, "loss": 0.5793, "mean_token_accuracy": 0.8776623606681824, "num_tokens": 411083822.0, "step": 3844 }, { "epoch": 8.759407069555301, "grad_norm": 3.484375, "learning_rate": 2.08540565345223e-07, "loss": 0.5747, "mean_token_accuracy": 0.8814112395048141, "num_tokens": 411191400.0, "step": 3845 }, { "epoch": 8.761687571265679, "grad_norm": 3.921875, "learning_rate": 2.0778813244203111e-07, "loss": 0.5745, "mean_token_accuracy": 0.8791114687919617, "num_tokens": 411298004.0, "step": 3846 }, { "epoch": 8.763968072976056, "grad_norm": 2.828125, "learning_rate": 2.0703700055404285e-07, "loss": 0.5826, "mean_token_accuracy": 0.879044771194458, "num_tokens": 411404179.0, "step": 3847 }, { "epoch": 8.766248574686431, "grad_norm": 3.015625, "learning_rate": 2.0628717010758526e-07, "loss": 0.5598, "mean_token_accuracy": 0.8879903703927994, "num_tokens": 411511695.0, "step": 3848 }, { "epoch": 8.768529076396808, "grad_norm": 2.8125, "learning_rate": 2.0553864152824815e-07, "loss": 0.5709, "mean_token_accuracy": 0.8788136094808578, "num_tokens": 411619621.0, "step": 3849 }, { "epoch": 8.770809578107183, "grad_norm": 2.71875, "learning_rate": 2.0479141524088169e-07, "loss": 0.5834, "mean_token_accuracy": 0.883419394493103, "num_tokens": 411726749.0, "step": 3850 }, { "epoch": 8.77309007981756, "grad_norm": 3.0, "learning_rate": 2.040454916695972e-07, "loss": 0.5781, "mean_token_accuracy": 0.8799424916505814, "num_tokens": 411833865.0, "step": 3851 }, { "epoch": 8.775370581527936, "grad_norm": 2.46875, "learning_rate": 2.0330087123776655e-07, "loss": 0.5619, "mean_token_accuracy": 0.884945884346962, "num_tokens": 411941568.0, "step": 3852 }, { "epoch": 8.777651083238313, "grad_norm": 2.8125, "learning_rate": 2.0255755436802248e-07, "loss": 0.5659, "mean_token_accuracy": 0.884887769818306, "num_tokens": 412048897.0, "step": 3853 }, { "epoch": 8.779931584948688, "grad_norm": 3.640625, "learning_rate": 2.0181554148225618e-07, "loss": 0.5732, "mean_token_accuracy": 0.8816755414009094, "num_tokens": 412155508.0, "step": 3854 }, { "epoch": 8.782212086659065, "grad_norm": 3.34375, "learning_rate": 2.0107483300162018e-07, "loss": 0.5635, "mean_token_accuracy": 0.885670393705368, "num_tokens": 412262951.0, "step": 3855 }, { "epoch": 8.78449258836944, "grad_norm": 2.546875, "learning_rate": 2.0033542934652679e-07, "loss": 0.5682, "mean_token_accuracy": 0.8826614916324615, "num_tokens": 412370424.0, "step": 3856 }, { "epoch": 8.786773090079818, "grad_norm": 2.734375, "learning_rate": 1.9959733093664696e-07, "loss": 0.5513, "mean_token_accuracy": 0.8853038400411606, "num_tokens": 412478059.0, "step": 3857 }, { "epoch": 8.789053591790195, "grad_norm": 2.875, "learning_rate": 1.9886053819091116e-07, "loss": 0.5739, "mean_token_accuracy": 0.8780199140310287, "num_tokens": 412584823.0, "step": 3858 }, { "epoch": 8.79133409350057, "grad_norm": 3.671875, "learning_rate": 1.981250515275085e-07, "loss": 0.5713, "mean_token_accuracy": 0.8827866017818451, "num_tokens": 412692334.0, "step": 3859 }, { "epoch": 8.793614595210947, "grad_norm": 3.203125, "learning_rate": 1.973908713638878e-07, "loss": 0.5509, "mean_token_accuracy": 0.8867141306400299, "num_tokens": 412799411.0, "step": 3860 }, { "epoch": 8.795895096921322, "grad_norm": 2.65625, "learning_rate": 1.9665799811675407e-07, "loss": 0.5644, "mean_token_accuracy": 0.8858144879341125, "num_tokens": 412906629.0, "step": 3861 }, { "epoch": 8.7981755986317, "grad_norm": 2.734375, "learning_rate": 1.959264322020732e-07, "loss": 0.5846, "mean_token_accuracy": 0.8803358823060989, "num_tokens": 413013873.0, "step": 3862 }, { "epoch": 8.800456100342075, "grad_norm": 4.15625, "learning_rate": 1.9519617403506747e-07, "loss": 0.5746, "mean_token_accuracy": 0.8833809643983841, "num_tokens": 413120858.0, "step": 3863 }, { "epoch": 8.802736602052452, "grad_norm": 3.0, "learning_rate": 1.9446722403021757e-07, "loss": 0.5868, "mean_token_accuracy": 0.8800954222679138, "num_tokens": 413227696.0, "step": 3864 }, { "epoch": 8.805017103762827, "grad_norm": 2.90625, "learning_rate": 1.9373958260126113e-07, "loss": 0.5579, "mean_token_accuracy": 0.8825733810663223, "num_tokens": 413334739.0, "step": 3865 }, { "epoch": 8.807297605473204, "grad_norm": 3.609375, "learning_rate": 1.9301325016119338e-07, "loss": 0.5645, "mean_token_accuracy": 0.8862461596727371, "num_tokens": 413441973.0, "step": 3866 }, { "epoch": 8.80957810718358, "grad_norm": 3.03125, "learning_rate": 1.9228822712226675e-07, "loss": 0.5728, "mean_token_accuracy": 0.880901426076889, "num_tokens": 413549047.0, "step": 3867 }, { "epoch": 8.811858608893957, "grad_norm": 3.1875, "learning_rate": 1.915645138959904e-07, "loss": 0.5737, "mean_token_accuracy": 0.8816043883562088, "num_tokens": 413656651.0, "step": 3868 }, { "epoch": 8.814139110604334, "grad_norm": 3.671875, "learning_rate": 1.908421108931302e-07, "loss": 0.572, "mean_token_accuracy": 0.8817258775234222, "num_tokens": 413763796.0, "step": 3869 }, { "epoch": 8.816419612314709, "grad_norm": 3.484375, "learning_rate": 1.9012101852370763e-07, "loss": 0.5479, "mean_token_accuracy": 0.8846326470375061, "num_tokens": 413871019.0, "step": 3870 }, { "epoch": 8.818700114025086, "grad_norm": 3.453125, "learning_rate": 1.894012371970008e-07, "loss": 0.556, "mean_token_accuracy": 0.8854041546583176, "num_tokens": 413978295.0, "step": 3871 }, { "epoch": 8.820980615735461, "grad_norm": 3.21875, "learning_rate": 1.8868276732154384e-07, "loss": 0.5792, "mean_token_accuracy": 0.8823095858097076, "num_tokens": 414085389.0, "step": 3872 }, { "epoch": 8.823261117445838, "grad_norm": 3.296875, "learning_rate": 1.879656093051266e-07, "loss": 0.5627, "mean_token_accuracy": 0.8850487917661667, "num_tokens": 414192622.0, "step": 3873 }, { "epoch": 8.825541619156214, "grad_norm": 3.28125, "learning_rate": 1.872497635547943e-07, "loss": 0.5427, "mean_token_accuracy": 0.8869512677192688, "num_tokens": 414299632.0, "step": 3874 }, { "epoch": 8.82782212086659, "grad_norm": 2.6875, "learning_rate": 1.8653523047684642e-07, "loss": 0.5858, "mean_token_accuracy": 0.882158100605011, "num_tokens": 414406824.0, "step": 3875 }, { "epoch": 8.830102622576966, "grad_norm": 3.203125, "learning_rate": 1.858220104768385e-07, "loss": 0.5762, "mean_token_accuracy": 0.8781119138002396, "num_tokens": 414513673.0, "step": 3876 }, { "epoch": 8.832383124287343, "grad_norm": 2.8125, "learning_rate": 1.8511010395958067e-07, "loss": 0.5843, "mean_token_accuracy": 0.8815079480409622, "num_tokens": 414621071.0, "step": 3877 }, { "epoch": 8.83466362599772, "grad_norm": 2.765625, "learning_rate": 1.843995113291372e-07, "loss": 0.544, "mean_token_accuracy": 0.8865767568349838, "num_tokens": 414728284.0, "step": 3878 }, { "epoch": 8.836944127708096, "grad_norm": 2.59375, "learning_rate": 1.836902329888268e-07, "loss": 0.5876, "mean_token_accuracy": 0.8805812895298004, "num_tokens": 414835107.0, "step": 3879 }, { "epoch": 8.839224629418473, "grad_norm": 3.875, "learning_rate": 1.829822693412217e-07, "loss": 0.5468, "mean_token_accuracy": 0.8886751085519791, "num_tokens": 414942335.0, "step": 3880 }, { "epoch": 8.841505131128848, "grad_norm": 3.984375, "learning_rate": 1.8227562078814903e-07, "loss": 0.5498, "mean_token_accuracy": 0.8845852613449097, "num_tokens": 415050100.0, "step": 3881 }, { "epoch": 8.843785632839225, "grad_norm": 3.6875, "learning_rate": 1.815702877306888e-07, "loss": 0.5837, "mean_token_accuracy": 0.8805902749300003, "num_tokens": 415157356.0, "step": 3882 }, { "epoch": 8.8460661345496, "grad_norm": 5.375, "learning_rate": 1.8086627056917382e-07, "loss": 0.5927, "mean_token_accuracy": 0.878188282251358, "num_tokens": 415264053.0, "step": 3883 }, { "epoch": 8.848346636259977, "grad_norm": 3.328125, "learning_rate": 1.8016356970319116e-07, "loss": 0.5798, "mean_token_accuracy": 0.883014589548111, "num_tokens": 415371061.0, "step": 3884 }, { "epoch": 8.850627137970353, "grad_norm": 4.5625, "learning_rate": 1.7946218553158062e-07, "loss": 0.5621, "mean_token_accuracy": 0.8840430676937103, "num_tokens": 415478709.0, "step": 3885 }, { "epoch": 8.85290763968073, "grad_norm": 3.375, "learning_rate": 1.7876211845243325e-07, "loss": 0.5506, "mean_token_accuracy": 0.8877336978912354, "num_tokens": 415586877.0, "step": 3886 }, { "epoch": 8.855188141391107, "grad_norm": 3.03125, "learning_rate": 1.780633688630942e-07, "loss": 0.5946, "mean_token_accuracy": 0.8771243989467621, "num_tokens": 415693262.0, "step": 3887 }, { "epoch": 8.857468643101482, "grad_norm": 3.6875, "learning_rate": 1.773659371601605e-07, "loss": 0.537, "mean_token_accuracy": 0.8894240111112595, "num_tokens": 415800803.0, "step": 3888 }, { "epoch": 8.85974914481186, "grad_norm": 4.1875, "learning_rate": 1.7666982373948038e-07, "loss": 0.5917, "mean_token_accuracy": 0.8793479204177856, "num_tokens": 415907854.0, "step": 3889 }, { "epoch": 8.862029646522235, "grad_norm": 4.09375, "learning_rate": 1.7597502899615538e-07, "loss": 0.5561, "mean_token_accuracy": 0.8833720535039902, "num_tokens": 416015169.0, "step": 3890 }, { "epoch": 8.864310148232612, "grad_norm": 5.15625, "learning_rate": 1.752815533245364e-07, "loss": 0.5711, "mean_token_accuracy": 0.8827664852142334, "num_tokens": 416122300.0, "step": 3891 }, { "epoch": 8.866590649942987, "grad_norm": 4.4375, "learning_rate": 1.745893971182272e-07, "loss": 0.6053, "mean_token_accuracy": 0.8742794394493103, "num_tokens": 416228785.0, "step": 3892 }, { "epoch": 8.868871151653364, "grad_norm": 2.96875, "learning_rate": 1.7389856077008245e-07, "loss": 0.5594, "mean_token_accuracy": 0.8836385905742645, "num_tokens": 416336321.0, "step": 3893 }, { "epoch": 8.87115165336374, "grad_norm": 3.34375, "learning_rate": 1.7320904467220762e-07, "loss": 0.5702, "mean_token_accuracy": 0.8823292106389999, "num_tokens": 416443558.0, "step": 3894 }, { "epoch": 8.873432155074116, "grad_norm": 4.59375, "learning_rate": 1.725208492159583e-07, "loss": 0.5829, "mean_token_accuracy": 0.8806789815425873, "num_tokens": 416550695.0, "step": 3895 }, { "epoch": 8.875712656784494, "grad_norm": 3.1875, "learning_rate": 1.7183397479194175e-07, "loss": 0.5769, "mean_token_accuracy": 0.8829580396413803, "num_tokens": 416657754.0, "step": 3896 }, { "epoch": 8.877993158494869, "grad_norm": 3.171875, "learning_rate": 1.711484217900139e-07, "loss": 0.5661, "mean_token_accuracy": 0.8825534284114838, "num_tokens": 416764992.0, "step": 3897 }, { "epoch": 8.880273660205246, "grad_norm": 2.796875, "learning_rate": 1.7046419059928154e-07, "loss": 0.5671, "mean_token_accuracy": 0.8812769651412964, "num_tokens": 416872041.0, "step": 3898 }, { "epoch": 8.882554161915621, "grad_norm": 3.546875, "learning_rate": 1.6978128160810098e-07, "loss": 0.5447, "mean_token_accuracy": 0.8902477920055389, "num_tokens": 416979187.0, "step": 3899 }, { "epoch": 8.884834663625998, "grad_norm": 3.328125, "learning_rate": 1.6909969520407854e-07, "loss": 0.5819, "mean_token_accuracy": 0.8812951445579529, "num_tokens": 417086032.0, "step": 3900 }, { "epoch": 8.887115165336374, "grad_norm": 2.765625, "learning_rate": 1.6841943177406976e-07, "loss": 0.5647, "mean_token_accuracy": 0.8830399364233017, "num_tokens": 417192794.0, "step": 3901 }, { "epoch": 8.88939566704675, "grad_norm": 2.84375, "learning_rate": 1.6774049170417806e-07, "loss": 0.5702, "mean_token_accuracy": 0.8833406120538712, "num_tokens": 417300018.0, "step": 3902 }, { "epoch": 8.891676168757126, "grad_norm": 2.703125, "learning_rate": 1.6706287537975763e-07, "loss": 0.5555, "mean_token_accuracy": 0.8872157335281372, "num_tokens": 417407045.0, "step": 3903 }, { "epoch": 8.893956670467503, "grad_norm": 3.375, "learning_rate": 1.6638658318540973e-07, "loss": 0.5741, "mean_token_accuracy": 0.8826020061969757, "num_tokens": 417514079.0, "step": 3904 }, { "epoch": 8.896237172177878, "grad_norm": 3.421875, "learning_rate": 1.657116155049851e-07, "loss": 0.5573, "mean_token_accuracy": 0.8848617970943451, "num_tokens": 417621403.0, "step": 3905 }, { "epoch": 8.898517673888255, "grad_norm": 3.03125, "learning_rate": 1.6503797272158284e-07, "loss": 0.571, "mean_token_accuracy": 0.8829448074102402, "num_tokens": 417728575.0, "step": 3906 }, { "epoch": 8.900798175598633, "grad_norm": 3.5625, "learning_rate": 1.643656552175485e-07, "loss": 0.5632, "mean_token_accuracy": 0.8809054046869278, "num_tokens": 417835651.0, "step": 3907 }, { "epoch": 8.903078677309008, "grad_norm": 3.140625, "learning_rate": 1.6369466337447708e-07, "loss": 0.5772, "mean_token_accuracy": 0.8784161955118179, "num_tokens": 417943079.0, "step": 3908 }, { "epoch": 8.905359179019385, "grad_norm": 2.609375, "learning_rate": 1.6302499757321066e-07, "loss": 0.5859, "mean_token_accuracy": 0.8787552714347839, "num_tokens": 418050175.0, "step": 3909 }, { "epoch": 8.90763968072976, "grad_norm": 3.484375, "learning_rate": 1.623566581938385e-07, "loss": 0.5655, "mean_token_accuracy": 0.8856528103351593, "num_tokens": 418157102.0, "step": 3910 }, { "epoch": 8.909920182440137, "grad_norm": 3.5, "learning_rate": 1.6168964561569716e-07, "loss": 0.5756, "mean_token_accuracy": 0.880318284034729, "num_tokens": 418264196.0, "step": 3911 }, { "epoch": 8.912200684150513, "grad_norm": 2.9375, "learning_rate": 1.6102396021737077e-07, "loss": 0.5442, "mean_token_accuracy": 0.886517733335495, "num_tokens": 418371298.0, "step": 3912 }, { "epoch": 8.91448118586089, "grad_norm": 3.65625, "learning_rate": 1.6035960237668818e-07, "loss": 0.5621, "mean_token_accuracy": 0.8844355195760727, "num_tokens": 418478409.0, "step": 3913 }, { "epoch": 8.916761687571265, "grad_norm": 2.734375, "learning_rate": 1.5969657247072695e-07, "loss": 0.5548, "mean_token_accuracy": 0.8860626816749573, "num_tokens": 418585134.0, "step": 3914 }, { "epoch": 8.919042189281642, "grad_norm": 2.84375, "learning_rate": 1.5903487087580994e-07, "loss": 0.5435, "mean_token_accuracy": 0.8894283324480057, "num_tokens": 418691838.0, "step": 3915 }, { "epoch": 8.921322690992017, "grad_norm": 3.0, "learning_rate": 1.5837449796750588e-07, "loss": 0.5838, "mean_token_accuracy": 0.8798937052488327, "num_tokens": 418798999.0, "step": 3916 }, { "epoch": 8.923603192702394, "grad_norm": 2.6875, "learning_rate": 1.577154541206305e-07, "loss": 0.5511, "mean_token_accuracy": 0.8867987394332886, "num_tokens": 418906023.0, "step": 3917 }, { "epoch": 8.925883694412772, "grad_norm": 3.453125, "learning_rate": 1.5705773970924349e-07, "loss": 0.5719, "mean_token_accuracy": 0.8823074996471405, "num_tokens": 419013460.0, "step": 3918 }, { "epoch": 8.928164196123147, "grad_norm": 3.71875, "learning_rate": 1.5640135510665094e-07, "loss": 0.5883, "mean_token_accuracy": 0.8819030374288559, "num_tokens": 419120119.0, "step": 3919 }, { "epoch": 8.930444697833524, "grad_norm": 3.234375, "learning_rate": 1.5574630068540458e-07, "loss": 0.5609, "mean_token_accuracy": 0.8858517855405807, "num_tokens": 419227218.0, "step": 3920 }, { "epoch": 8.9327251995439, "grad_norm": 2.59375, "learning_rate": 1.5509257681730034e-07, "loss": 0.5511, "mean_token_accuracy": 0.8868541866540909, "num_tokens": 419334292.0, "step": 3921 }, { "epoch": 8.935005701254276, "grad_norm": 3.96875, "learning_rate": 1.5444018387337946e-07, "loss": 0.5592, "mean_token_accuracy": 0.8833621442317963, "num_tokens": 419441056.0, "step": 3922 }, { "epoch": 8.937286202964652, "grad_norm": 3.453125, "learning_rate": 1.537891222239271e-07, "loss": 0.5803, "mean_token_accuracy": 0.8798842430114746, "num_tokens": 419548438.0, "step": 3923 }, { "epoch": 8.939566704675029, "grad_norm": 3.84375, "learning_rate": 1.5313939223847384e-07, "loss": 0.5782, "mean_token_accuracy": 0.8819588273763657, "num_tokens": 419655399.0, "step": 3924 }, { "epoch": 8.941847206385404, "grad_norm": 2.578125, "learning_rate": 1.5249099428579383e-07, "loss": 0.5513, "mean_token_accuracy": 0.8892662078142166, "num_tokens": 419763644.0, "step": 3925 }, { "epoch": 8.944127708095781, "grad_norm": 2.875, "learning_rate": 1.5184392873390463e-07, "loss": 0.5756, "mean_token_accuracy": 0.8778876066207886, "num_tokens": 419870546.0, "step": 3926 }, { "epoch": 8.946408209806158, "grad_norm": 2.6875, "learning_rate": 1.5119819595006857e-07, "loss": 0.5773, "mean_token_accuracy": 0.8791385740041733, "num_tokens": 419977393.0, "step": 3927 }, { "epoch": 8.948688711516533, "grad_norm": 3.3125, "learning_rate": 1.5055379630079163e-07, "loss": 0.5893, "mean_token_accuracy": 0.8777313679456711, "num_tokens": 420084260.0, "step": 3928 }, { "epoch": 8.95096921322691, "grad_norm": 2.734375, "learning_rate": 1.4991073015182184e-07, "loss": 0.5734, "mean_token_accuracy": 0.8823111951351166, "num_tokens": 420191156.0, "step": 3929 }, { "epoch": 8.953249714937286, "grad_norm": 3.953125, "learning_rate": 1.4926899786815107e-07, "loss": 0.5715, "mean_token_accuracy": 0.883946105837822, "num_tokens": 420298497.0, "step": 3930 }, { "epoch": 8.955530216647663, "grad_norm": 2.859375, "learning_rate": 1.4862859981401468e-07, "loss": 0.583, "mean_token_accuracy": 0.882349282503128, "num_tokens": 420405474.0, "step": 3931 }, { "epoch": 8.957810718358038, "grad_norm": 3.765625, "learning_rate": 1.4798953635288994e-07, "loss": 0.5589, "mean_token_accuracy": 0.8836222589015961, "num_tokens": 420513022.0, "step": 3932 }, { "epoch": 8.960091220068415, "grad_norm": 2.90625, "learning_rate": 1.4735180784749754e-07, "loss": 0.5709, "mean_token_accuracy": 0.880601704120636, "num_tokens": 420620084.0, "step": 3933 }, { "epoch": 8.96237172177879, "grad_norm": 2.859375, "learning_rate": 1.4671541465979877e-07, "loss": 0.5701, "mean_token_accuracy": 0.8797517418861389, "num_tokens": 420726931.0, "step": 3934 }, { "epoch": 8.964652223489168, "grad_norm": 3.296875, "learning_rate": 1.460803571509989e-07, "loss": 0.5746, "mean_token_accuracy": 0.8833180367946625, "num_tokens": 420834187.0, "step": 3935 }, { "epoch": 8.966932725199545, "grad_norm": 2.65625, "learning_rate": 1.4544663568154427e-07, "loss": 0.5821, "mean_token_accuracy": 0.8844758570194244, "num_tokens": 420940822.0, "step": 3936 }, { "epoch": 8.96921322690992, "grad_norm": 3.125, "learning_rate": 1.448142506111225e-07, "loss": 0.568, "mean_token_accuracy": 0.8841772228479385, "num_tokens": 421048221.0, "step": 3937 }, { "epoch": 8.971493728620297, "grad_norm": 2.609375, "learning_rate": 1.441832022986636e-07, "loss": 0.58, "mean_token_accuracy": 0.8786625862121582, "num_tokens": 421154952.0, "step": 3938 }, { "epoch": 8.973774230330672, "grad_norm": 2.609375, "learning_rate": 1.4355349110233868e-07, "loss": 0.5533, "mean_token_accuracy": 0.8868290036916733, "num_tokens": 421261851.0, "step": 3939 }, { "epoch": 8.97605473204105, "grad_norm": 4.1875, "learning_rate": 1.42925117379559e-07, "loss": 0.5804, "mean_token_accuracy": 0.8779198080301285, "num_tokens": 421368450.0, "step": 3940 }, { "epoch": 8.978335233751425, "grad_norm": 2.96875, "learning_rate": 1.4229808148697732e-07, "loss": 0.5751, "mean_token_accuracy": 0.8809890896081924, "num_tokens": 421475775.0, "step": 3941 }, { "epoch": 8.980615735461802, "grad_norm": 3.015625, "learning_rate": 1.416723837804876e-07, "loss": 0.5647, "mean_token_accuracy": 0.8843148350715637, "num_tokens": 421582954.0, "step": 3942 }, { "epoch": 8.982896237172177, "grad_norm": 3.34375, "learning_rate": 1.410480246152235e-07, "loss": 0.567, "mean_token_accuracy": 0.8814869672060013, "num_tokens": 421690190.0, "step": 3943 }, { "epoch": 8.985176738882554, "grad_norm": 3.546875, "learning_rate": 1.4042500434555961e-07, "loss": 0.5812, "mean_token_accuracy": 0.8802603036165237, "num_tokens": 421797293.0, "step": 3944 }, { "epoch": 8.987457240592931, "grad_norm": 2.890625, "learning_rate": 1.398033233251095e-07, "loss": 0.5761, "mean_token_accuracy": 0.8830568790435791, "num_tokens": 421904481.0, "step": 3945 }, { "epoch": 8.989737742303307, "grad_norm": 2.609375, "learning_rate": 1.3918298190672806e-07, "loss": 0.5455, "mean_token_accuracy": 0.8889565318822861, "num_tokens": 422011408.0, "step": 3946 }, { "epoch": 8.992018244013684, "grad_norm": 2.90625, "learning_rate": 1.3856398044250846e-07, "loss": 0.5774, "mean_token_accuracy": 0.8796057999134064, "num_tokens": 422118456.0, "step": 3947 }, { "epoch": 8.994298745724059, "grad_norm": 2.65625, "learning_rate": 1.3794631928378434e-07, "loss": 0.5586, "mean_token_accuracy": 0.8840108662843704, "num_tokens": 422225963.0, "step": 3948 }, { "epoch": 8.996579247434436, "grad_norm": 2.515625, "learning_rate": 1.3732999878112856e-07, "loss": 0.5605, "mean_token_accuracy": 0.8850581645965576, "num_tokens": 422333254.0, "step": 3949 }, { "epoch": 8.998859749144811, "grad_norm": 2.890625, "learning_rate": 1.3671501928435193e-07, "loss": 0.5784, "mean_token_accuracy": 0.8808369934558868, "num_tokens": 422440753.0, "step": 3950 }, { "epoch": 9.0, "grad_norm": 9.0, "learning_rate": 1.361013811425052e-07, "loss": 0.5721, "mean_token_accuracy": 0.8868737518787384, "num_tokens": 422480088.0, "step": 3951 }, { "epoch": 9.002280501710377, "grad_norm": 3.015625, "learning_rate": 1.3548908470387783e-07, "loss": 0.5611, "mean_token_accuracy": 0.8844682276248932, "num_tokens": 422587127.0, "step": 3952 }, { "epoch": 9.004561003420752, "grad_norm": 2.421875, "learning_rate": 1.348781303159974e-07, "loss": 0.5231, "mean_token_accuracy": 0.8936333805322647, "num_tokens": 422694591.0, "step": 3953 }, { "epoch": 9.00684150513113, "grad_norm": 3.234375, "learning_rate": 1.3426851832562982e-07, "loss": 0.5644, "mean_token_accuracy": 0.8864496648311615, "num_tokens": 422802195.0, "step": 3954 }, { "epoch": 9.009122006841505, "grad_norm": 2.5, "learning_rate": 1.3366024907877917e-07, "loss": 0.5561, "mean_token_accuracy": 0.8846766799688339, "num_tokens": 422909976.0, "step": 3955 }, { "epoch": 9.011402508551882, "grad_norm": 2.921875, "learning_rate": 1.3305332292068706e-07, "loss": 0.5821, "mean_token_accuracy": 0.8805938214063644, "num_tokens": 423016740.0, "step": 3956 }, { "epoch": 9.013683010262257, "grad_norm": 3.546875, "learning_rate": 1.3244774019583296e-07, "loss": 0.5726, "mean_token_accuracy": 0.8852843195199966, "num_tokens": 423123757.0, "step": 3957 }, { "epoch": 9.015963511972634, "grad_norm": 3.0625, "learning_rate": 1.318435012479341e-07, "loss": 0.5802, "mean_token_accuracy": 0.8831017911434174, "num_tokens": 423229906.0, "step": 3958 }, { "epoch": 9.01824401368301, "grad_norm": 2.8125, "learning_rate": 1.3124060641994507e-07, "loss": 0.5664, "mean_token_accuracy": 0.88319331407547, "num_tokens": 423337912.0, "step": 3959 }, { "epoch": 9.020524515393387, "grad_norm": 3.421875, "learning_rate": 1.306390560540577e-07, "loss": 0.5514, "mean_token_accuracy": 0.8833757936954498, "num_tokens": 423445320.0, "step": 3960 }, { "epoch": 9.020524515393387, "eval_loss": 0.5863176584243774, "eval_mean_token_accuracy": 0.8799216114976107, "eval_num_tokens": 423445320.0, "eval_runtime": 58.6917, "eval_samples_per_second": 142.865, "eval_steps_per_second": 4.481, "step": 3960 }, { "epoch": 9.022805017103764, "grad_norm": 3.03125, "learning_rate": 1.300388504916991e-07, "loss": 0.5857, "mean_token_accuracy": 0.8808793723583221, "num_tokens": 423551972.0, "step": 3961 }, { "epoch": 9.025085518814139, "grad_norm": 3.375, "learning_rate": 1.2943999007353518e-07, "loss": 0.5633, "mean_token_accuracy": 0.8862560987472534, "num_tokens": 423658903.0, "step": 3962 }, { "epoch": 9.027366020524516, "grad_norm": 3.6875, "learning_rate": 1.2884247513946761e-07, "loss": 0.5818, "mean_token_accuracy": 0.8841415643692017, "num_tokens": 423765920.0, "step": 3963 }, { "epoch": 9.029646522234891, "grad_norm": 2.796875, "learning_rate": 1.2824630602863402e-07, "loss": 0.5812, "mean_token_accuracy": 0.8800481855869293, "num_tokens": 423872709.0, "step": 3964 }, { "epoch": 9.031927023945268, "grad_norm": 2.75, "learning_rate": 1.2765148307940927e-07, "loss": 0.572, "mean_token_accuracy": 0.8826144337654114, "num_tokens": 423979876.0, "step": 3965 }, { "epoch": 9.034207525655644, "grad_norm": 4.09375, "learning_rate": 1.270580066294022e-07, "loss": 0.563, "mean_token_accuracy": 0.8837986141443253, "num_tokens": 424086963.0, "step": 3966 }, { "epoch": 9.03648802736602, "grad_norm": 3.5625, "learning_rate": 1.264658770154592e-07, "loss": 0.5676, "mean_token_accuracy": 0.8831162303686142, "num_tokens": 424193595.0, "step": 3967 }, { "epoch": 9.038768529076396, "grad_norm": 3.78125, "learning_rate": 1.258750945736617e-07, "loss": 0.5552, "mean_token_accuracy": 0.8876445442438126, "num_tokens": 424300328.0, "step": 3968 }, { "epoch": 9.041049030786773, "grad_norm": 3.46875, "learning_rate": 1.252856596393262e-07, "loss": 0.5579, "mean_token_accuracy": 0.8856015801429749, "num_tokens": 424407500.0, "step": 3969 }, { "epoch": 9.043329532497149, "grad_norm": 2.75, "learning_rate": 1.2469757254700454e-07, "loss": 0.5624, "mean_token_accuracy": 0.8857114911079407, "num_tokens": 424515230.0, "step": 3970 }, { "epoch": 9.045610034207526, "grad_norm": 2.828125, "learning_rate": 1.2411083363048386e-07, "loss": 0.5901, "mean_token_accuracy": 0.878177598118782, "num_tokens": 424622085.0, "step": 3971 }, { "epoch": 9.047890535917903, "grad_norm": 2.921875, "learning_rate": 1.2352544322278558e-07, "loss": 0.5995, "mean_token_accuracy": 0.8785828799009323, "num_tokens": 424728586.0, "step": 3972 }, { "epoch": 9.050171037628278, "grad_norm": 3.109375, "learning_rate": 1.2294140165616613e-07, "loss": 0.5772, "mean_token_accuracy": 0.883171334862709, "num_tokens": 424835689.0, "step": 3973 }, { "epoch": 9.052451539338655, "grad_norm": 2.6875, "learning_rate": 1.223587092621162e-07, "loss": 0.5783, "mean_token_accuracy": 0.880466416478157, "num_tokens": 424942400.0, "step": 3974 }, { "epoch": 9.05473204104903, "grad_norm": 3.921875, "learning_rate": 1.2177736637136063e-07, "loss": 0.5743, "mean_token_accuracy": 0.8840536028146744, "num_tokens": 425049454.0, "step": 3975 }, { "epoch": 9.057012542759407, "grad_norm": 4.125, "learning_rate": 1.2119737331385885e-07, "loss": 0.5884, "mean_token_accuracy": 0.8803068101406097, "num_tokens": 425156346.0, "step": 3976 }, { "epoch": 9.059293044469783, "grad_norm": 3.734375, "learning_rate": 1.2061873041880335e-07, "loss": 0.5798, "mean_token_accuracy": 0.8801010698080063, "num_tokens": 425262768.0, "step": 3977 }, { "epoch": 9.06157354618016, "grad_norm": 2.546875, "learning_rate": 1.200414380146206e-07, "loss": 0.5709, "mean_token_accuracy": 0.882665142416954, "num_tokens": 425370275.0, "step": 3978 }, { "epoch": 9.063854047890535, "grad_norm": 3.203125, "learning_rate": 1.1946549642897043e-07, "loss": 0.5668, "mean_token_accuracy": 0.8818689733743668, "num_tokens": 425477099.0, "step": 3979 }, { "epoch": 9.066134549600912, "grad_norm": 3.125, "learning_rate": 1.1889090598874692e-07, "loss": 0.5815, "mean_token_accuracy": 0.8815028220415115, "num_tokens": 425584752.0, "step": 3980 }, { "epoch": 9.06841505131129, "grad_norm": 4.25, "learning_rate": 1.1831766702007613e-07, "loss": 0.5884, "mean_token_accuracy": 0.8803604692220688, "num_tokens": 425691333.0, "step": 3981 }, { "epoch": 9.070695553021665, "grad_norm": 3.0, "learning_rate": 1.1774577984831725e-07, "loss": 0.5801, "mean_token_accuracy": 0.8821363896131516, "num_tokens": 425798165.0, "step": 3982 }, { "epoch": 9.072976054732042, "grad_norm": 3.484375, "learning_rate": 1.1717524479806231e-07, "loss": 0.578, "mean_token_accuracy": 0.8794075697660446, "num_tokens": 425904837.0, "step": 3983 }, { "epoch": 9.075256556442417, "grad_norm": 3.046875, "learning_rate": 1.1660606219313642e-07, "loss": 0.5743, "mean_token_accuracy": 0.8835705667734146, "num_tokens": 426012137.0, "step": 3984 }, { "epoch": 9.077537058152794, "grad_norm": 3.5625, "learning_rate": 1.1603823235659644e-07, "loss": 0.5691, "mean_token_accuracy": 0.8850951492786407, "num_tokens": 426119123.0, "step": 3985 }, { "epoch": 9.07981755986317, "grad_norm": 2.78125, "learning_rate": 1.1547175561073154e-07, "loss": 0.5737, "mean_token_accuracy": 0.8812219202518463, "num_tokens": 426226278.0, "step": 3986 }, { "epoch": 9.082098061573546, "grad_norm": 2.953125, "learning_rate": 1.1490663227706311e-07, "loss": 0.5866, "mean_token_accuracy": 0.88066066801548, "num_tokens": 426333015.0, "step": 3987 }, { "epoch": 9.084378563283922, "grad_norm": 2.484375, "learning_rate": 1.1434286267634432e-07, "loss": 0.5532, "mean_token_accuracy": 0.8882465213537216, "num_tokens": 426440373.0, "step": 3988 }, { "epoch": 9.086659064994299, "grad_norm": 3.28125, "learning_rate": 1.1378044712855946e-07, "loss": 0.6018, "mean_token_accuracy": 0.8752111494541168, "num_tokens": 426546913.0, "step": 3989 }, { "epoch": 9.088939566704674, "grad_norm": 2.96875, "learning_rate": 1.1321938595292542e-07, "loss": 0.568, "mean_token_accuracy": 0.8806653469800949, "num_tokens": 426654053.0, "step": 3990 }, { "epoch": 9.091220068415051, "grad_norm": 3.015625, "learning_rate": 1.1265967946788913e-07, "loss": 0.5653, "mean_token_accuracy": 0.8836180865764618, "num_tokens": 426761077.0, "step": 3991 }, { "epoch": 9.093500570125428, "grad_norm": 3.671875, "learning_rate": 1.1210132799112954e-07, "loss": 0.5426, "mean_token_accuracy": 0.8866991996765137, "num_tokens": 426868400.0, "step": 3992 }, { "epoch": 9.095781071835804, "grad_norm": 2.734375, "learning_rate": 1.1154433183955593e-07, "loss": 0.5666, "mean_token_accuracy": 0.880575105547905, "num_tokens": 426975681.0, "step": 3993 }, { "epoch": 9.09806157354618, "grad_norm": 3.03125, "learning_rate": 1.1098869132930846e-07, "loss": 0.5781, "mean_token_accuracy": 0.880921483039856, "num_tokens": 427082443.0, "step": 3994 }, { "epoch": 9.100342075256556, "grad_norm": 2.71875, "learning_rate": 1.1043440677575818e-07, "loss": 0.5642, "mean_token_accuracy": 0.88336580991745, "num_tokens": 427189040.0, "step": 3995 }, { "epoch": 9.102622576966933, "grad_norm": 3.015625, "learning_rate": 1.0988147849350623e-07, "loss": 0.5718, "mean_token_accuracy": 0.8802250772714615, "num_tokens": 427295944.0, "step": 3996 }, { "epoch": 9.104903078677308, "grad_norm": 2.46875, "learning_rate": 1.0932990679638406e-07, "loss": 0.5563, "mean_token_accuracy": 0.8847872316837311, "num_tokens": 427402191.0, "step": 3997 }, { "epoch": 9.107183580387685, "grad_norm": 3.5, "learning_rate": 1.0877969199745347e-07, "loss": 0.5764, "mean_token_accuracy": 0.8834807723760605, "num_tokens": 427509134.0, "step": 3998 }, { "epoch": 9.10946408209806, "grad_norm": 3.078125, "learning_rate": 1.0823083440900523e-07, "loss": 0.5802, "mean_token_accuracy": 0.8807051777839661, "num_tokens": 427616092.0, "step": 3999 }, { "epoch": 9.111744583808438, "grad_norm": 3.578125, "learning_rate": 1.0768333434256039e-07, "loss": 0.587, "mean_token_accuracy": 0.8833329081535339, "num_tokens": 427723041.0, "step": 4000 }, { "epoch": 9.114025085518815, "grad_norm": 2.765625, "learning_rate": 1.071371921088693e-07, "loss": 0.5673, "mean_token_accuracy": 0.8830773681402206, "num_tokens": 427830278.0, "step": 4001 }, { "epoch": 9.11630558722919, "grad_norm": 2.828125, "learning_rate": 1.0659240801791204e-07, "loss": 0.575, "mean_token_accuracy": 0.8835585117340088, "num_tokens": 427937452.0, "step": 4002 }, { "epoch": 9.118586088939567, "grad_norm": 3.125, "learning_rate": 1.0604898237889794e-07, "loss": 0.5665, "mean_token_accuracy": 0.8840138912200928, "num_tokens": 428044476.0, "step": 4003 }, { "epoch": 9.120866590649943, "grad_norm": 3.984375, "learning_rate": 1.0550691550026415e-07, "loss": 0.5759, "mean_token_accuracy": 0.8799526989459991, "num_tokens": 428151794.0, "step": 4004 }, { "epoch": 9.12314709236032, "grad_norm": 3.6875, "learning_rate": 1.0496620768967736e-07, "loss": 0.5874, "mean_token_accuracy": 0.8784212470054626, "num_tokens": 428258124.0, "step": 4005 }, { "epoch": 9.125427594070695, "grad_norm": 3.859375, "learning_rate": 1.0442685925403346e-07, "loss": 0.5583, "mean_token_accuracy": 0.8846355378627777, "num_tokens": 428365889.0, "step": 4006 }, { "epoch": 9.127708095781072, "grad_norm": 4.125, "learning_rate": 1.0388887049945589e-07, "loss": 0.5837, "mean_token_accuracy": 0.8810495138168335, "num_tokens": 428472825.0, "step": 4007 }, { "epoch": 9.129988597491447, "grad_norm": 2.8125, "learning_rate": 1.0335224173129683e-07, "loss": 0.5729, "mean_token_accuracy": 0.8812502026557922, "num_tokens": 428580741.0, "step": 4008 }, { "epoch": 9.132269099201825, "grad_norm": 3.046875, "learning_rate": 1.0281697325413593e-07, "loss": 0.5508, "mean_token_accuracy": 0.8845276087522507, "num_tokens": 428687964.0, "step": 4009 }, { "epoch": 9.134549600912202, "grad_norm": 3.078125, "learning_rate": 1.0228306537178185e-07, "loss": 0.5828, "mean_token_accuracy": 0.8836958706378937, "num_tokens": 428794803.0, "step": 4010 }, { "epoch": 9.136830102622577, "grad_norm": 2.859375, "learning_rate": 1.0175051838727023e-07, "loss": 0.5516, "mean_token_accuracy": 0.8883339315652847, "num_tokens": 428901676.0, "step": 4011 }, { "epoch": 9.139110604332954, "grad_norm": 3.046875, "learning_rate": 1.0121933260286432e-07, "loss": 0.5539, "mean_token_accuracy": 0.8851373344659805, "num_tokens": 429009258.0, "step": 4012 }, { "epoch": 9.14139110604333, "grad_norm": 3.296875, "learning_rate": 1.0068950832005487e-07, "loss": 0.57, "mean_token_accuracy": 0.8837276846170425, "num_tokens": 429116708.0, "step": 4013 }, { "epoch": 9.143671607753706, "grad_norm": 3.0625, "learning_rate": 1.0016104583956021e-07, "loss": 0.5865, "mean_token_accuracy": 0.8805951774120331, "num_tokens": 429223799.0, "step": 4014 }, { "epoch": 9.145952109464082, "grad_norm": 3.6875, "learning_rate": 9.963394546132488e-08, "loss": 0.5714, "mean_token_accuracy": 0.8829237967729568, "num_tokens": 429330842.0, "step": 4015 }, { "epoch": 9.148232611174459, "grad_norm": 2.625, "learning_rate": 9.91082074845215e-08, "loss": 0.5576, "mean_token_accuracy": 0.8858160525560379, "num_tokens": 429438548.0, "step": 4016 }, { "epoch": 9.150513112884834, "grad_norm": 3.0625, "learning_rate": 9.85838322075483e-08, "loss": 0.5708, "mean_token_accuracy": 0.8821554481983185, "num_tokens": 429545699.0, "step": 4017 }, { "epoch": 9.152793614595211, "grad_norm": 3.78125, "learning_rate": 9.806081992803084e-08, "loss": 0.5884, "mean_token_accuracy": 0.8780340850353241, "num_tokens": 429653689.0, "step": 4018 }, { "epoch": 9.155074116305586, "grad_norm": 3.03125, "learning_rate": 9.753917094282112e-08, "loss": 0.5746, "mean_token_accuracy": 0.8829029351472855, "num_tokens": 429759979.0, "step": 4019 }, { "epoch": 9.157354618015964, "grad_norm": 2.46875, "learning_rate": 9.701888554799643e-08, "loss": 0.5683, "mean_token_accuracy": 0.883645236492157, "num_tokens": 429867491.0, "step": 4020 }, { "epoch": 9.15963511972634, "grad_norm": 2.953125, "learning_rate": 9.649996403886086e-08, "loss": 0.5653, "mean_token_accuracy": 0.8832023441791534, "num_tokens": 429974470.0, "step": 4021 }, { "epoch": 9.161915621436716, "grad_norm": 3.0625, "learning_rate": 9.598240670994435e-08, "loss": 0.5901, "mean_token_accuracy": 0.8796355575323105, "num_tokens": 430081389.0, "step": 4022 }, { "epoch": 9.164196123147093, "grad_norm": 3.359375, "learning_rate": 9.546621385500249e-08, "loss": 0.5847, "mean_token_accuracy": 0.8781373351812363, "num_tokens": 430188309.0, "step": 4023 }, { "epoch": 9.166476624857468, "grad_norm": 2.921875, "learning_rate": 9.495138576701673e-08, "loss": 0.5693, "mean_token_accuracy": 0.883067861199379, "num_tokens": 430295602.0, "step": 4024 }, { "epoch": 9.168757126567845, "grad_norm": 3.078125, "learning_rate": 9.443792273819252e-08, "loss": 0.5593, "mean_token_accuracy": 0.8858152031898499, "num_tokens": 430402252.0, "step": 4025 }, { "epoch": 9.17103762827822, "grad_norm": 2.734375, "learning_rate": 9.392582505996256e-08, "loss": 0.5787, "mean_token_accuracy": 0.8808012902736664, "num_tokens": 430508615.0, "step": 4026 }, { "epoch": 9.173318129988598, "grad_norm": 3.4375, "learning_rate": 9.341509302298295e-08, "loss": 0.5636, "mean_token_accuracy": 0.8854473978281021, "num_tokens": 430615447.0, "step": 4027 }, { "epoch": 9.175598631698973, "grad_norm": 2.703125, "learning_rate": 9.290572691713573e-08, "loss": 0.5635, "mean_token_accuracy": 0.885080024600029, "num_tokens": 430722374.0, "step": 4028 }, { "epoch": 9.17787913340935, "grad_norm": 2.609375, "learning_rate": 9.23977270315271e-08, "loss": 0.5819, "mean_token_accuracy": 0.8824837505817413, "num_tokens": 430829818.0, "step": 4029 }, { "epoch": 9.180159635119727, "grad_norm": 2.640625, "learning_rate": 9.18910936544884e-08, "loss": 0.548, "mean_token_accuracy": 0.8877168446779251, "num_tokens": 430936968.0, "step": 4030 }, { "epoch": 9.182440136830103, "grad_norm": 3.21875, "learning_rate": 9.138582707357429e-08, "loss": 0.5572, "mean_token_accuracy": 0.8826006203889847, "num_tokens": 431043889.0, "step": 4031 }, { "epoch": 9.18472063854048, "grad_norm": 3.96875, "learning_rate": 9.088192757556457e-08, "loss": 0.576, "mean_token_accuracy": 0.8780194073915482, "num_tokens": 431150431.0, "step": 4032 }, { "epoch": 9.187001140250855, "grad_norm": 2.984375, "learning_rate": 9.037939544646324e-08, "loss": 0.551, "mean_token_accuracy": 0.8834515959024429, "num_tokens": 431257816.0, "step": 4033 }, { "epoch": 9.189281641961232, "grad_norm": 3.046875, "learning_rate": 8.987823097149739e-08, "loss": 0.5718, "mean_token_accuracy": 0.8835292160511017, "num_tokens": 431364325.0, "step": 4034 }, { "epoch": 9.191562143671607, "grad_norm": 3.0, "learning_rate": 8.93784344351184e-08, "loss": 0.5672, "mean_token_accuracy": 0.8845600485801697, "num_tokens": 431471086.0, "step": 4035 }, { "epoch": 9.193842645381984, "grad_norm": 3.0, "learning_rate": 8.888000612100128e-08, "loss": 0.5694, "mean_token_accuracy": 0.8847863525152206, "num_tokens": 431578776.0, "step": 4036 }, { "epoch": 9.19612314709236, "grad_norm": 2.875, "learning_rate": 8.838294631204391e-08, "loss": 0.5679, "mean_token_accuracy": 0.8858127593994141, "num_tokens": 431685845.0, "step": 4037 }, { "epoch": 9.198403648802737, "grad_norm": 3.21875, "learning_rate": 8.788725529036812e-08, "loss": 0.5587, "mean_token_accuracy": 0.885520726442337, "num_tokens": 431793922.0, "step": 4038 }, { "epoch": 9.200684150513112, "grad_norm": 3.109375, "learning_rate": 8.739293333731886e-08, "loss": 0.5813, "mean_token_accuracy": 0.8812271952629089, "num_tokens": 431900233.0, "step": 4039 }, { "epoch": 9.20296465222349, "grad_norm": 2.75, "learning_rate": 8.689998073346361e-08, "loss": 0.5602, "mean_token_accuracy": 0.8870201855897903, "num_tokens": 432007117.0, "step": 4040 }, { "epoch": 9.205245153933866, "grad_norm": 3.25, "learning_rate": 8.640839775859222e-08, "loss": 0.5591, "mean_token_accuracy": 0.8841793239116669, "num_tokens": 432114160.0, "step": 4041 }, { "epoch": 9.207525655644242, "grad_norm": 2.75, "learning_rate": 8.591818469171815e-08, "loss": 0.5726, "mean_token_accuracy": 0.8814292848110199, "num_tokens": 432221294.0, "step": 4042 }, { "epoch": 9.209806157354619, "grad_norm": 3.078125, "learning_rate": 8.542934181107687e-08, "loss": 0.5693, "mean_token_accuracy": 0.8811886161565781, "num_tokens": 432328223.0, "step": 4043 }, { "epoch": 9.212086659064994, "grad_norm": 3.390625, "learning_rate": 8.494186939412591e-08, "loss": 0.5692, "mean_token_accuracy": 0.8848684132099152, "num_tokens": 432435810.0, "step": 4044 }, { "epoch": 9.214367160775371, "grad_norm": 2.984375, "learning_rate": 8.44557677175456e-08, "loss": 0.5732, "mean_token_accuracy": 0.8800568580627441, "num_tokens": 432542846.0, "step": 4045 }, { "epoch": 9.216647662485746, "grad_norm": 2.5625, "learning_rate": 8.397103705723774e-08, "loss": 0.5613, "mean_token_accuracy": 0.8849961012601852, "num_tokens": 432649987.0, "step": 4046 }, { "epoch": 9.218928164196123, "grad_norm": 2.8125, "learning_rate": 8.348767768832561e-08, "loss": 0.566, "mean_token_accuracy": 0.8844825327396393, "num_tokens": 432756682.0, "step": 4047 }, { "epoch": 9.221208665906499, "grad_norm": 3.015625, "learning_rate": 8.300568988515529e-08, "loss": 0.5756, "mean_token_accuracy": 0.8798407763242722, "num_tokens": 432863598.0, "step": 4048 }, { "epoch": 9.223489167616876, "grad_norm": 3.703125, "learning_rate": 8.25250739212935e-08, "loss": 0.5694, "mean_token_accuracy": 0.8815117180347443, "num_tokens": 432970619.0, "step": 4049 }, { "epoch": 9.225769669327253, "grad_norm": 2.96875, "learning_rate": 8.204583006952843e-08, "loss": 0.5811, "mean_token_accuracy": 0.88057541847229, "num_tokens": 433077375.0, "step": 4050 }, { "epoch": 9.228050171037628, "grad_norm": 2.890625, "learning_rate": 8.156795860187028e-08, "loss": 0.5483, "mean_token_accuracy": 0.8875467032194138, "num_tokens": 433184582.0, "step": 4051 }, { "epoch": 9.230330672748005, "grad_norm": 2.65625, "learning_rate": 8.109145978954874e-08, "loss": 0.582, "mean_token_accuracy": 0.8819152861833572, "num_tokens": 433291314.0, "step": 4052 }, { "epoch": 9.23261117445838, "grad_norm": 2.75, "learning_rate": 8.061633390301582e-08, "loss": 0.5711, "mean_token_accuracy": 0.8808294236660004, "num_tokens": 433398910.0, "step": 4053 }, { "epoch": 9.234891676168758, "grad_norm": 3.03125, "learning_rate": 8.014258121194385e-08, "loss": 0.5633, "mean_token_accuracy": 0.885577842593193, "num_tokens": 433505751.0, "step": 4054 }, { "epoch": 9.237172177879133, "grad_norm": 3.984375, "learning_rate": 7.967020198522579e-08, "loss": 0.5536, "mean_token_accuracy": 0.885611966252327, "num_tokens": 433613189.0, "step": 4055 }, { "epoch": 9.23945267958951, "grad_norm": 2.3125, "learning_rate": 7.91991964909744e-08, "loss": 0.5444, "mean_token_accuracy": 0.8886383771896362, "num_tokens": 433720725.0, "step": 4056 }, { "epoch": 9.241733181299885, "grad_norm": 3.0625, "learning_rate": 7.872956499652418e-08, "loss": 0.5946, "mean_token_accuracy": 0.8744540363550186, "num_tokens": 433827682.0, "step": 4057 }, { "epoch": 9.244013683010262, "grad_norm": 3.0625, "learning_rate": 7.826130776842828e-08, "loss": 0.5958, "mean_token_accuracy": 0.8749952912330627, "num_tokens": 433934460.0, "step": 4058 }, { "epoch": 9.246294184720638, "grad_norm": 3.375, "learning_rate": 7.779442507246021e-08, "loss": 0.5542, "mean_token_accuracy": 0.8850563615560532, "num_tokens": 434041663.0, "step": 4059 }, { "epoch": 9.248574686431015, "grad_norm": 3.5625, "learning_rate": 7.73289171736144e-08, "loss": 0.5755, "mean_token_accuracy": 0.882745549082756, "num_tokens": 434147970.0, "step": 4060 }, { "epoch": 9.250855188141392, "grad_norm": 3.21875, "learning_rate": 7.686478433610339e-08, "loss": 0.5937, "mean_token_accuracy": 0.8776877820491791, "num_tokens": 434255682.0, "step": 4061 }, { "epoch": 9.253135689851767, "grad_norm": 2.96875, "learning_rate": 7.64020268233609e-08, "loss": 0.5453, "mean_token_accuracy": 0.8867078274488449, "num_tokens": 434362871.0, "step": 4062 }, { "epoch": 9.255416191562144, "grad_norm": 3.140625, "learning_rate": 7.594064489803821e-08, "loss": 0.5594, "mean_token_accuracy": 0.8832486718893051, "num_tokens": 434470335.0, "step": 4063 }, { "epoch": 9.25769669327252, "grad_norm": 3.015625, "learning_rate": 7.548063882200724e-08, "loss": 0.5691, "mean_token_accuracy": 0.8802042752504349, "num_tokens": 434577136.0, "step": 4064 }, { "epoch": 9.259977194982897, "grad_norm": 3.609375, "learning_rate": 7.502200885635858e-08, "loss": 0.5797, "mean_token_accuracy": 0.8827760517597198, "num_tokens": 434684066.0, "step": 4065 }, { "epoch": 9.262257696693272, "grad_norm": 5.09375, "learning_rate": 7.45647552614015e-08, "loss": 0.5649, "mean_token_accuracy": 0.8822989910840988, "num_tokens": 434791074.0, "step": 4066 }, { "epoch": 9.264538198403649, "grad_norm": 3.1875, "learning_rate": 7.410887829666479e-08, "loss": 0.5713, "mean_token_accuracy": 0.8809485137462616, "num_tokens": 434897922.0, "step": 4067 }, { "epoch": 9.266818700114024, "grad_norm": 2.65625, "learning_rate": 7.365437822089482e-08, "loss": 0.5737, "mean_token_accuracy": 0.8806920945644379, "num_tokens": 435005339.0, "step": 4068 }, { "epoch": 9.269099201824401, "grad_norm": 3.03125, "learning_rate": 7.320125529205746e-08, "loss": 0.5583, "mean_token_accuracy": 0.8838251531124115, "num_tokens": 435112853.0, "step": 4069 }, { "epoch": 9.271379703534778, "grad_norm": 3.265625, "learning_rate": 7.274950976733642e-08, "loss": 0.5781, "mean_token_accuracy": 0.8822825849056244, "num_tokens": 435220066.0, "step": 4070 }, { "epoch": 9.273660205245154, "grad_norm": 2.875, "learning_rate": 7.22991419031338e-08, "loss": 0.5787, "mean_token_accuracy": 0.8838555067777634, "num_tokens": 435327142.0, "step": 4071 }, { "epoch": 9.27594070695553, "grad_norm": 2.796875, "learning_rate": 7.185015195506961e-08, "loss": 0.5666, "mean_token_accuracy": 0.8847849667072296, "num_tokens": 435433996.0, "step": 4072 }, { "epoch": 9.278221208665906, "grad_norm": 3.46875, "learning_rate": 7.140254017798221e-08, "loss": 0.5752, "mean_token_accuracy": 0.8813326507806778, "num_tokens": 435541275.0, "step": 4073 }, { "epoch": 9.280501710376283, "grad_norm": 3.40625, "learning_rate": 7.095630682592669e-08, "loss": 0.5792, "mean_token_accuracy": 0.8786799758672714, "num_tokens": 435648369.0, "step": 4074 }, { "epoch": 9.282782212086659, "grad_norm": 2.578125, "learning_rate": 7.051145215217715e-08, "loss": 0.5671, "mean_token_accuracy": 0.8824814110994339, "num_tokens": 435755124.0, "step": 4075 }, { "epoch": 9.285062713797036, "grad_norm": 3.484375, "learning_rate": 7.006797640922436e-08, "loss": 0.5746, "mean_token_accuracy": 0.8815957754850388, "num_tokens": 435861525.0, "step": 4076 }, { "epoch": 9.287343215507411, "grad_norm": 3.015625, "learning_rate": 6.962587984877617e-08, "loss": 0.5816, "mean_token_accuracy": 0.8820008486509323, "num_tokens": 435968074.0, "step": 4077 }, { "epoch": 9.289623717217788, "grad_norm": 3.4375, "learning_rate": 6.918516272175879e-08, "loss": 0.5769, "mean_token_accuracy": 0.8813347816467285, "num_tokens": 436075305.0, "step": 4078 }, { "epoch": 9.291904218928163, "grad_norm": 2.78125, "learning_rate": 6.874582527831409e-08, "loss": 0.5631, "mean_token_accuracy": 0.8813544809818268, "num_tokens": 436183200.0, "step": 4079 }, { "epoch": 9.29418472063854, "grad_norm": 3.203125, "learning_rate": 6.830786776780174e-08, "loss": 0.607, "mean_token_accuracy": 0.8745275288820267, "num_tokens": 436289903.0, "step": 4080 }, { "epoch": 9.296465222348917, "grad_norm": 2.953125, "learning_rate": 6.78712904387982e-08, "loss": 0.5851, "mean_token_accuracy": 0.8781695067882538, "num_tokens": 436396884.0, "step": 4081 }, { "epoch": 9.298745724059293, "grad_norm": 3.15625, "learning_rate": 6.74360935390958e-08, "loss": 0.5561, "mean_token_accuracy": 0.8871065676212311, "num_tokens": 436504033.0, "step": 4082 }, { "epoch": 9.30102622576967, "grad_norm": 7.09375, "learning_rate": 6.700227731570475e-08, "loss": 0.5711, "mean_token_accuracy": 0.8803330808877945, "num_tokens": 436610843.0, "step": 4083 }, { "epoch": 9.303306727480045, "grad_norm": 2.734375, "learning_rate": 6.656984201485001e-08, "loss": 0.5725, "mean_token_accuracy": 0.8825246691703796, "num_tokens": 436717813.0, "step": 4084 }, { "epoch": 9.305587229190422, "grad_norm": 3.40625, "learning_rate": 6.613878788197359e-08, "loss": 0.5887, "mean_token_accuracy": 0.8762739151716232, "num_tokens": 436824381.0, "step": 4085 }, { "epoch": 9.307867730900798, "grad_norm": 2.46875, "learning_rate": 6.570911516173368e-08, "loss": 0.5568, "mean_token_accuracy": 0.8874360471963882, "num_tokens": 436931568.0, "step": 4086 }, { "epoch": 9.310148232611175, "grad_norm": 3.828125, "learning_rate": 6.528082409800434e-08, "loss": 0.5697, "mean_token_accuracy": 0.8826966434717178, "num_tokens": 437038163.0, "step": 4087 }, { "epoch": 9.31242873432155, "grad_norm": 2.578125, "learning_rate": 6.485391493387505e-08, "loss": 0.5684, "mean_token_accuracy": 0.883079007267952, "num_tokens": 437145250.0, "step": 4088 }, { "epoch": 9.314709236031927, "grad_norm": 3.25, "learning_rate": 6.442838791165168e-08, "loss": 0.5927, "mean_token_accuracy": 0.8812314420938492, "num_tokens": 437251824.0, "step": 4089 }, { "epoch": 9.316989737742304, "grad_norm": 3.578125, "learning_rate": 6.400424327285437e-08, "loss": 0.581, "mean_token_accuracy": 0.8754829615354538, "num_tokens": 437358958.0, "step": 4090 }, { "epoch": 9.31927023945268, "grad_norm": 2.703125, "learning_rate": 6.358148125822e-08, "loss": 0.5529, "mean_token_accuracy": 0.8846077919006348, "num_tokens": 437466352.0, "step": 4091 }, { "epoch": 9.321550741163056, "grad_norm": 2.96875, "learning_rate": 6.316010210769997e-08, "loss": 0.5987, "mean_token_accuracy": 0.8769486397504807, "num_tokens": 437573435.0, "step": 4092 }, { "epoch": 9.323831242873432, "grad_norm": 2.78125, "learning_rate": 6.274010606046071e-08, "loss": 0.5611, "mean_token_accuracy": 0.8839032351970673, "num_tokens": 437680600.0, "step": 4093 }, { "epoch": 9.326111744583809, "grad_norm": 2.796875, "learning_rate": 6.232149335488463e-08, "loss": 0.572, "mean_token_accuracy": 0.8846787661314011, "num_tokens": 437787269.0, "step": 4094 }, { "epoch": 9.328392246294184, "grad_norm": 3.84375, "learning_rate": 6.190426422856749e-08, "loss": 0.5708, "mean_token_accuracy": 0.8827256411314011, "num_tokens": 437893943.0, "step": 4095 }, { "epoch": 9.330672748004561, "grad_norm": 2.65625, "learning_rate": 6.148841891832069e-08, "loss": 0.5731, "mean_token_accuracy": 0.8799492716789246, "num_tokens": 438000911.0, "step": 4096 }, { "epoch": 9.332953249714937, "grad_norm": 2.796875, "learning_rate": 6.107395766016988e-08, "loss": 0.5613, "mean_token_accuracy": 0.8834062963724136, "num_tokens": 438107441.0, "step": 4097 }, { "epoch": 9.335233751425314, "grad_norm": 2.5625, "learning_rate": 6.066088068935577e-08, "loss": 0.5703, "mean_token_accuracy": 0.8836114853620529, "num_tokens": 438214716.0, "step": 4098 }, { "epoch": 9.33751425313569, "grad_norm": 2.671875, "learning_rate": 6.024918824033221e-08, "loss": 0.5945, "mean_token_accuracy": 0.878153920173645, "num_tokens": 438320941.0, "step": 4099 }, { "epoch": 9.339794754846066, "grad_norm": 2.921875, "learning_rate": 5.983888054676867e-08, "loss": 0.5657, "mean_token_accuracy": 0.8838547617197037, "num_tokens": 438428175.0, "step": 4100 }, { "epoch": 9.342075256556443, "grad_norm": 3.296875, "learning_rate": 5.9429957841546926e-08, "loss": 0.5726, "mean_token_accuracy": 0.8802156448364258, "num_tokens": 438534915.0, "step": 4101 }, { "epoch": 9.344355758266818, "grad_norm": 5.0625, "learning_rate": 5.902242035676409e-08, "loss": 0.563, "mean_token_accuracy": 0.8827090561389923, "num_tokens": 438641893.0, "step": 4102 }, { "epoch": 9.346636259977195, "grad_norm": 2.984375, "learning_rate": 5.8616268323730685e-08, "loss": 0.5755, "mean_token_accuracy": 0.8797716945409775, "num_tokens": 438749068.0, "step": 4103 }, { "epoch": 9.34891676168757, "grad_norm": 5.15625, "learning_rate": 5.821150197297038e-08, "loss": 0.5804, "mean_token_accuracy": 0.878428652882576, "num_tokens": 438855850.0, "step": 4104 }, { "epoch": 9.351197263397948, "grad_norm": 2.859375, "learning_rate": 5.780812153422161e-08, "loss": 0.5663, "mean_token_accuracy": 0.8853023201227188, "num_tokens": 438962709.0, "step": 4105 }, { "epoch": 9.353477765108323, "grad_norm": 4.53125, "learning_rate": 5.7406127236434016e-08, "loss": 0.5649, "mean_token_accuracy": 0.8817013502120972, "num_tokens": 439069382.0, "step": 4106 }, { "epoch": 9.3557582668187, "grad_norm": 4.15625, "learning_rate": 5.700551930777287e-08, "loss": 0.5525, "mean_token_accuracy": 0.8876905292272568, "num_tokens": 439176711.0, "step": 4107 }, { "epoch": 9.358038768529076, "grad_norm": 2.765625, "learning_rate": 5.66062979756149e-08, "loss": 0.568, "mean_token_accuracy": 0.8845077753067017, "num_tokens": 439284100.0, "step": 4108 }, { "epoch": 9.360319270239453, "grad_norm": 3.28125, "learning_rate": 5.620846346655079e-08, "loss": 0.5581, "mean_token_accuracy": 0.8825329095125198, "num_tokens": 439391460.0, "step": 4109 }, { "epoch": 9.36259977194983, "grad_norm": 2.765625, "learning_rate": 5.5812016006383805e-08, "loss": 0.5723, "mean_token_accuracy": 0.8812670260667801, "num_tokens": 439498912.0, "step": 4110 }, { "epoch": 9.364880273660205, "grad_norm": 2.828125, "learning_rate": 5.5416955820129515e-08, "loss": 0.5616, "mean_token_accuracy": 0.8810815811157227, "num_tokens": 439606318.0, "step": 4111 }, { "epoch": 9.367160775370582, "grad_norm": 3.1875, "learning_rate": 5.50232831320166e-08, "loss": 0.5562, "mean_token_accuracy": 0.8831993341445923, "num_tokens": 439712921.0, "step": 4112 }, { "epoch": 9.369441277080957, "grad_norm": 2.8125, "learning_rate": 5.463099816548578e-08, "loss": 0.5788, "mean_token_accuracy": 0.8801111429929733, "num_tokens": 439819707.0, "step": 4113 }, { "epoch": 9.371721778791335, "grad_norm": 3.015625, "learning_rate": 5.424010114319117e-08, "loss": 0.5637, "mean_token_accuracy": 0.8822034150362015, "num_tokens": 439926343.0, "step": 4114 }, { "epoch": 9.37400228050171, "grad_norm": 3.84375, "learning_rate": 5.385059228699779e-08, "loss": 0.5728, "mean_token_accuracy": 0.8807101249694824, "num_tokens": 440033050.0, "step": 4115 }, { "epoch": 9.376282782212087, "grad_norm": 3.078125, "learning_rate": 5.346247181798325e-08, "loss": 0.5855, "mean_token_accuracy": 0.8771592527627945, "num_tokens": 440140455.0, "step": 4116 }, { "epoch": 9.378563283922462, "grad_norm": 3.015625, "learning_rate": 5.307573995643772e-08, "loss": 0.5551, "mean_token_accuracy": 0.8888624608516693, "num_tokens": 440247555.0, "step": 4117 }, { "epoch": 9.38084378563284, "grad_norm": 2.890625, "learning_rate": 5.2690396921862284e-08, "loss": 0.5652, "mean_token_accuracy": 0.8854438215494156, "num_tokens": 440354832.0, "step": 4118 }, { "epoch": 9.383124287343216, "grad_norm": 2.828125, "learning_rate": 5.230644293297088e-08, "loss": 0.5617, "mean_token_accuracy": 0.8868270665407181, "num_tokens": 440462224.0, "step": 4119 }, { "epoch": 9.385404789053592, "grad_norm": 2.953125, "learning_rate": 5.192387820768752e-08, "loss": 0.5614, "mean_token_accuracy": 0.8861428201198578, "num_tokens": 440569106.0, "step": 4120 }, { "epoch": 9.387685290763969, "grad_norm": 3.515625, "learning_rate": 5.154270296314878e-08, "loss": 0.5661, "mean_token_accuracy": 0.8851838260889053, "num_tokens": 440676323.0, "step": 4121 }, { "epoch": 9.389965792474344, "grad_norm": 2.796875, "learning_rate": 5.116291741570301e-08, "loss": 0.5632, "mean_token_accuracy": 0.8838883936405182, "num_tokens": 440784286.0, "step": 4122 }, { "epoch": 9.392246294184721, "grad_norm": 2.921875, "learning_rate": 5.078452178090831e-08, "loss": 0.561, "mean_token_accuracy": 0.8820521384477615, "num_tokens": 440891324.0, "step": 4123 }, { "epoch": 9.394526795895096, "grad_norm": 2.5625, "learning_rate": 5.040751627353513e-08, "loss": 0.5779, "mean_token_accuracy": 0.8817588239908218, "num_tokens": 440998383.0, "step": 4124 }, { "epoch": 9.396807297605474, "grad_norm": 2.65625, "learning_rate": 5.003190110756451e-08, "loss": 0.5587, "mean_token_accuracy": 0.8854601830244064, "num_tokens": 441105621.0, "step": 4125 }, { "epoch": 9.399087799315849, "grad_norm": 2.953125, "learning_rate": 4.965767649618869e-08, "loss": 0.5863, "mean_token_accuracy": 0.8789174407720566, "num_tokens": 441212053.0, "step": 4126 }, { "epoch": 9.401368301026226, "grad_norm": 2.625, "learning_rate": 4.928484265180972e-08, "loss": 0.5602, "mean_token_accuracy": 0.8848675191402435, "num_tokens": 441318500.0, "step": 4127 }, { "epoch": 9.403648802736601, "grad_norm": 2.8125, "learning_rate": 4.8913399786041097e-08, "loss": 0.591, "mean_token_accuracy": 0.8787374198436737, "num_tokens": 441425061.0, "step": 4128 }, { "epoch": 9.405929304446978, "grad_norm": 2.875, "learning_rate": 4.854334810970668e-08, "loss": 0.5563, "mean_token_accuracy": 0.8844193369150162, "num_tokens": 441531921.0, "step": 4129 }, { "epoch": 9.408209806157355, "grad_norm": 3.171875, "learning_rate": 4.817468783284096e-08, "loss": 0.5782, "mean_token_accuracy": 0.8798136711120605, "num_tokens": 441639273.0, "step": 4130 }, { "epoch": 9.41049030786773, "grad_norm": 2.625, "learning_rate": 4.7807419164687673e-08, "loss": 0.5543, "mean_token_accuracy": 0.8829266577959061, "num_tokens": 441745920.0, "step": 4131 }, { "epoch": 9.412770809578108, "grad_norm": 6.03125, "learning_rate": 4.7441542313702293e-08, "loss": 0.5991, "mean_token_accuracy": 0.8739556968212128, "num_tokens": 441852952.0, "step": 4132 }, { "epoch": 9.415051311288483, "grad_norm": 3.09375, "learning_rate": 4.707705748754898e-08, "loss": 0.5619, "mean_token_accuracy": 0.8853324800729752, "num_tokens": 441960257.0, "step": 4133 }, { "epoch": 9.41733181299886, "grad_norm": 3.734375, "learning_rate": 4.671396489310198e-08, "loss": 0.5689, "mean_token_accuracy": 0.8818874061107635, "num_tokens": 442067515.0, "step": 4134 }, { "epoch": 9.419612314709235, "grad_norm": 4.375, "learning_rate": 4.635226473644616e-08, "loss": 0.5711, "mean_token_accuracy": 0.8827540129423141, "num_tokens": 442174146.0, "step": 4135 }, { "epoch": 9.421892816419613, "grad_norm": 3.046875, "learning_rate": 4.599195722287536e-08, "loss": 0.5512, "mean_token_accuracy": 0.8860030323266983, "num_tokens": 442281613.0, "step": 4136 }, { "epoch": 9.424173318129988, "grad_norm": 2.6875, "learning_rate": 4.5633042556893493e-08, "loss": 0.5853, "mean_token_accuracy": 0.8759856522083282, "num_tokens": 442388874.0, "step": 4137 }, { "epoch": 9.426453819840365, "grad_norm": 2.765625, "learning_rate": 4.527552094221288e-08, "loss": 0.5799, "mean_token_accuracy": 0.8808791786432266, "num_tokens": 442495992.0, "step": 4138 }, { "epoch": 9.428734321550742, "grad_norm": 2.953125, "learning_rate": 4.4919392581756204e-08, "loss": 0.5786, "mean_token_accuracy": 0.8829675912857056, "num_tokens": 442603136.0, "step": 4139 }, { "epoch": 9.431014823261117, "grad_norm": 3.4375, "learning_rate": 4.456465767765539e-08, "loss": 0.5803, "mean_token_accuracy": 0.8808294236660004, "num_tokens": 442710669.0, "step": 4140 }, { "epoch": 9.433295324971494, "grad_norm": 3.265625, "learning_rate": 4.421131643125104e-08, "loss": 0.5673, "mean_token_accuracy": 0.8814819753170013, "num_tokens": 442817853.0, "step": 4141 }, { "epoch": 9.43557582668187, "grad_norm": 2.53125, "learning_rate": 4.3859369043092183e-08, "loss": 0.558, "mean_token_accuracy": 0.8864561766386032, "num_tokens": 442925142.0, "step": 4142 }, { "epoch": 9.437856328392247, "grad_norm": 3.34375, "learning_rate": 4.350881571293819e-08, "loss": 0.5963, "mean_token_accuracy": 0.8762660771608353, "num_tokens": 443032724.0, "step": 4143 }, { "epoch": 9.440136830102622, "grad_norm": 2.984375, "learning_rate": 4.315965663975602e-08, "loss": 0.5754, "mean_token_accuracy": 0.8806591182947159, "num_tokens": 443140265.0, "step": 4144 }, { "epoch": 9.442417331813, "grad_norm": 3.296875, "learning_rate": 4.281189202172131e-08, "loss": 0.5769, "mean_token_accuracy": 0.8811807930469513, "num_tokens": 443247055.0, "step": 4145 }, { "epoch": 9.444697833523374, "grad_norm": 2.734375, "learning_rate": 4.246552205621896e-08, "loss": 0.5625, "mean_token_accuracy": 0.8876753896474838, "num_tokens": 443353977.0, "step": 4146 }, { "epoch": 9.446978335233752, "grad_norm": 2.703125, "learning_rate": 4.212054693984169e-08, "loss": 0.5843, "mean_token_accuracy": 0.8812119662761688, "num_tokens": 443460573.0, "step": 4147 }, { "epoch": 9.449258836944129, "grad_norm": 2.859375, "learning_rate": 4.177696686839094e-08, "loss": 0.593, "mean_token_accuracy": 0.8792661875486374, "num_tokens": 443567499.0, "step": 4148 }, { "epoch": 9.451539338654504, "grad_norm": 2.96875, "learning_rate": 4.143478203687573e-08, "loss": 0.5428, "mean_token_accuracy": 0.8875003904104233, "num_tokens": 443674500.0, "step": 4149 }, { "epoch": 9.453819840364881, "grad_norm": 2.46875, "learning_rate": 4.1093992639514026e-08, "loss": 0.5456, "mean_token_accuracy": 0.8857799172401428, "num_tokens": 443781405.0, "step": 4150 }, { "epoch": 9.456100342075256, "grad_norm": 2.71875, "learning_rate": 4.0754598869730824e-08, "loss": 0.5592, "mean_token_accuracy": 0.8854545056819916, "num_tokens": 443888679.0, "step": 4151 }, { "epoch": 9.458380843785633, "grad_norm": 3.15625, "learning_rate": 4.041660092015981e-08, "loss": 0.5611, "mean_token_accuracy": 0.8850967884063721, "num_tokens": 443996366.0, "step": 4152 }, { "epoch": 9.460661345496009, "grad_norm": 3.234375, "learning_rate": 4.007999898264225e-08, "loss": 0.596, "mean_token_accuracy": 0.8776397556066513, "num_tokens": 444104029.0, "step": 4153 }, { "epoch": 9.462941847206386, "grad_norm": 3.203125, "learning_rate": 3.9744793248226446e-08, "loss": 0.5659, "mean_token_accuracy": 0.8829027712345123, "num_tokens": 444211032.0, "step": 4154 }, { "epoch": 9.465222348916761, "grad_norm": 3.84375, "learning_rate": 3.9410983907169076e-08, "loss": 0.5758, "mean_token_accuracy": 0.8814502954483032, "num_tokens": 444317977.0, "step": 4155 }, { "epoch": 9.467502850627138, "grad_norm": 4.15625, "learning_rate": 3.90785711489336e-08, "loss": 0.594, "mean_token_accuracy": 0.8752471357584, "num_tokens": 444424810.0, "step": 4156 }, { "epoch": 9.469783352337513, "grad_norm": 2.734375, "learning_rate": 3.874755516219103e-08, "loss": 0.5697, "mean_token_accuracy": 0.8843945115804672, "num_tokens": 444532283.0, "step": 4157 }, { "epoch": 9.47206385404789, "grad_norm": 2.828125, "learning_rate": 3.8417936134820255e-08, "loss": 0.5483, "mean_token_accuracy": 0.8853261023759842, "num_tokens": 444639261.0, "step": 4158 }, { "epoch": 9.474344355758268, "grad_norm": 2.796875, "learning_rate": 3.808971425390606e-08, "loss": 0.5861, "mean_token_accuracy": 0.8793308734893799, "num_tokens": 444745716.0, "step": 4159 }, { "epoch": 9.476624857468643, "grad_norm": 4.1875, "learning_rate": 3.7762889705740824e-08, "loss": 0.5629, "mean_token_accuracy": 0.8815562427043915, "num_tokens": 444852792.0, "step": 4160 }, { "epoch": 9.47890535917902, "grad_norm": 2.828125, "learning_rate": 3.743746267582421e-08, "loss": 0.5619, "mean_token_accuracy": 0.8823586702346802, "num_tokens": 444960631.0, "step": 4161 }, { "epoch": 9.481185860889395, "grad_norm": 3.65625, "learning_rate": 3.711343334886236e-08, "loss": 0.5929, "mean_token_accuracy": 0.8763528019189835, "num_tokens": 445067005.0, "step": 4162 }, { "epoch": 9.483466362599772, "grad_norm": 5.28125, "learning_rate": 3.679080190876788e-08, "loss": 0.558, "mean_token_accuracy": 0.8835884630680084, "num_tokens": 445174337.0, "step": 4163 }, { "epoch": 9.485746864310148, "grad_norm": 2.875, "learning_rate": 3.646956853865985e-08, "loss": 0.5779, "mean_token_accuracy": 0.8791737705469131, "num_tokens": 445281162.0, "step": 4164 }, { "epoch": 9.488027366020525, "grad_norm": 2.953125, "learning_rate": 3.614973342086464e-08, "loss": 0.5696, "mean_token_accuracy": 0.8816744834184647, "num_tokens": 445388038.0, "step": 4165 }, { "epoch": 9.4903078677309, "grad_norm": 3.75, "learning_rate": 3.583129673691427e-08, "loss": 0.5796, "mean_token_accuracy": 0.8827396035194397, "num_tokens": 445494974.0, "step": 4166 }, { "epoch": 9.492588369441277, "grad_norm": 3.96875, "learning_rate": 3.551425866754693e-08, "loss": 0.5716, "mean_token_accuracy": 0.8809922337532043, "num_tokens": 445602110.0, "step": 4167 }, { "epoch": 9.494868871151652, "grad_norm": 3.0625, "learning_rate": 3.519861939270786e-08, "loss": 0.5779, "mean_token_accuracy": 0.8822459131479263, "num_tokens": 445709330.0, "step": 4168 }, { "epoch": 9.49714937286203, "grad_norm": 2.578125, "learning_rate": 3.4884379091547905e-08, "loss": 0.59, "mean_token_accuracy": 0.8781626224517822, "num_tokens": 445816251.0, "step": 4169 }, { "epoch": 9.499429874572407, "grad_norm": 3.703125, "learning_rate": 3.457153794242302e-08, "loss": 0.581, "mean_token_accuracy": 0.8822884410619736, "num_tokens": 445923058.0, "step": 4170 }, { "epoch": 9.501710376282782, "grad_norm": 2.734375, "learning_rate": 3.4260096122896435e-08, "loss": 0.5643, "mean_token_accuracy": 0.8818854689598083, "num_tokens": 446030612.0, "step": 4171 }, { "epoch": 9.503990877993159, "grad_norm": 2.71875, "learning_rate": 3.3950053809736204e-08, "loss": 0.5733, "mean_token_accuracy": 0.8821684867143631, "num_tokens": 446137524.0, "step": 4172 }, { "epoch": 9.506271379703534, "grad_norm": 3.046875, "learning_rate": 3.364141117891656e-08, "loss": 0.5467, "mean_token_accuracy": 0.8861701488494873, "num_tokens": 446245159.0, "step": 4173 }, { "epoch": 9.508551881413911, "grad_norm": 4.3125, "learning_rate": 3.333416840561709e-08, "loss": 0.5643, "mean_token_accuracy": 0.8825655877590179, "num_tokens": 446351813.0, "step": 4174 }, { "epoch": 9.510832383124287, "grad_norm": 2.890625, "learning_rate": 3.302832566422276e-08, "loss": 0.5637, "mean_token_accuracy": 0.8839205503463745, "num_tokens": 446459491.0, "step": 4175 }, { "epoch": 9.513112884834664, "grad_norm": 3.109375, "learning_rate": 3.272388312832414e-08, "loss": 0.5609, "mean_token_accuracy": 0.88397616147995, "num_tokens": 446566325.0, "step": 4176 }, { "epoch": 9.515393386545039, "grad_norm": 2.8125, "learning_rate": 3.242084097071663e-08, "loss": 0.5719, "mean_token_accuracy": 0.8808989524841309, "num_tokens": 446672682.0, "step": 4177 }, { "epoch": 9.517673888255416, "grad_norm": 2.578125, "learning_rate": 3.211919936340152e-08, "loss": 0.5851, "mean_token_accuracy": 0.8773170709609985, "num_tokens": 446779847.0, "step": 4178 }, { "epoch": 9.519954389965793, "grad_norm": 3.4375, "learning_rate": 3.1818958477584375e-08, "loss": 0.562, "mean_token_accuracy": 0.8823383450508118, "num_tokens": 446886733.0, "step": 4179 }, { "epoch": 9.522234891676169, "grad_norm": 2.78125, "learning_rate": 3.152011848367664e-08, "loss": 0.5527, "mean_token_accuracy": 0.8837304264307022, "num_tokens": 446994279.0, "step": 4180 }, { "epoch": 9.522234891676169, "eval_loss": 0.5863840579986572, "eval_mean_token_accuracy": 0.8800425250720615, "eval_num_tokens": 446994279.0, "eval_runtime": 58.647, "eval_samples_per_second": 142.974, "eval_steps_per_second": 4.484, "step": 4180 }, { "epoch": 9.524515393386546, "grad_norm": 3.328125, "learning_rate": 3.1222679551293486e-08, "loss": 0.5806, "mean_token_accuracy": 0.878794476389885, "num_tokens": 447100998.0, "step": 4181 }, { "epoch": 9.526795895096921, "grad_norm": 3.6875, "learning_rate": 3.0926641849255976e-08, "loss": 0.5735, "mean_token_accuracy": 0.8829492926597595, "num_tokens": 447208281.0, "step": 4182 }, { "epoch": 9.529076396807298, "grad_norm": 3.09375, "learning_rate": 3.063200554558915e-08, "loss": 0.5751, "mean_token_accuracy": 0.8793532252311707, "num_tokens": 447315119.0, "step": 4183 }, { "epoch": 9.531356898517673, "grad_norm": 2.453125, "learning_rate": 3.033877080752312e-08, "loss": 0.5559, "mean_token_accuracy": 0.8861254155635834, "num_tokens": 447422365.0, "step": 4184 }, { "epoch": 9.53363740022805, "grad_norm": 2.875, "learning_rate": 3.0046937801491983e-08, "loss": 0.5595, "mean_token_accuracy": 0.8834296315908432, "num_tokens": 447529666.0, "step": 4185 }, { "epoch": 9.535917901938426, "grad_norm": 2.921875, "learning_rate": 2.97565066931349e-08, "loss": 0.5897, "mean_token_accuracy": 0.8798715174198151, "num_tokens": 447636941.0, "step": 4186 }, { "epoch": 9.538198403648803, "grad_norm": 3.984375, "learning_rate": 2.9467477647294464e-08, "loss": 0.5894, "mean_token_accuracy": 0.8795375227928162, "num_tokens": 447744498.0, "step": 4187 }, { "epoch": 9.54047890535918, "grad_norm": 3.9375, "learning_rate": 2.917985082801833e-08, "loss": 0.5647, "mean_token_accuracy": 0.8826041370630264, "num_tokens": 447852156.0, "step": 4188 }, { "epoch": 9.542759407069555, "grad_norm": 2.9375, "learning_rate": 2.8893626398557583e-08, "loss": 0.5683, "mean_token_accuracy": 0.8823677599430084, "num_tokens": 447958991.0, "step": 4189 }, { "epoch": 9.545039908779932, "grad_norm": 3.71875, "learning_rate": 2.8608804521368382e-08, "loss": 0.5586, "mean_token_accuracy": 0.8871615529060364, "num_tokens": 448066219.0, "step": 4190 }, { "epoch": 9.547320410490308, "grad_norm": 3.640625, "learning_rate": 2.832538535810947e-08, "loss": 0.5837, "mean_token_accuracy": 0.8768136501312256, "num_tokens": 448173466.0, "step": 4191 }, { "epoch": 9.549600912200685, "grad_norm": 3.1875, "learning_rate": 2.804336906964439e-08, "loss": 0.5775, "mean_token_accuracy": 0.8800145536661148, "num_tokens": 448280207.0, "step": 4192 }, { "epoch": 9.55188141391106, "grad_norm": 4.375, "learning_rate": 2.7762755816039823e-08, "loss": 0.5778, "mean_token_accuracy": 0.8806869983673096, "num_tokens": 448387602.0, "step": 4193 }, { "epoch": 9.554161915621437, "grad_norm": 3.578125, "learning_rate": 2.74835457565667e-08, "loss": 0.5452, "mean_token_accuracy": 0.8861921578645706, "num_tokens": 448495042.0, "step": 4194 }, { "epoch": 9.556442417331812, "grad_norm": 2.796875, "learning_rate": 2.7205739049699365e-08, "loss": 0.5467, "mean_token_accuracy": 0.8862817734479904, "num_tokens": 448603055.0, "step": 4195 }, { "epoch": 9.55872291904219, "grad_norm": 3.75, "learning_rate": 2.6929335853115302e-08, "loss": 0.5855, "mean_token_accuracy": 0.878138080239296, "num_tokens": 448710105.0, "step": 4196 }, { "epoch": 9.561003420752566, "grad_norm": 2.640625, "learning_rate": 2.6654336323695963e-08, "loss": 0.5668, "mean_token_accuracy": 0.8829863220453262, "num_tokens": 448817588.0, "step": 4197 }, { "epoch": 9.563283922462942, "grad_norm": 2.828125, "learning_rate": 2.63807406175251e-08, "loss": 0.5701, "mean_token_accuracy": 0.8825019598007202, "num_tokens": 448924426.0, "step": 4198 }, { "epoch": 9.565564424173319, "grad_norm": 3.1875, "learning_rate": 2.6108548889891005e-08, "loss": 0.5751, "mean_token_accuracy": 0.8802784979343414, "num_tokens": 449032265.0, "step": 4199 }, { "epoch": 9.567844925883694, "grad_norm": 3.03125, "learning_rate": 2.5837761295284258e-08, "loss": 0.5914, "mean_token_accuracy": 0.878703162074089, "num_tokens": 449139772.0, "step": 4200 }, { "epoch": 9.570125427594071, "grad_norm": 3.515625, "learning_rate": 2.5568377987398862e-08, "loss": 0.5538, "mean_token_accuracy": 0.8867183774709702, "num_tokens": 449247187.0, "step": 4201 }, { "epoch": 9.572405929304447, "grad_norm": 2.578125, "learning_rate": 2.5300399119131124e-08, "loss": 0.5641, "mean_token_accuracy": 0.8858496695756912, "num_tokens": 449354632.0, "step": 4202 }, { "epoch": 9.574686431014824, "grad_norm": 3.375, "learning_rate": 2.5033824842581046e-08, "loss": 0.5552, "mean_token_accuracy": 0.8847160637378693, "num_tokens": 449461642.0, "step": 4203 }, { "epoch": 9.576966932725199, "grad_norm": 3.328125, "learning_rate": 2.476865530905065e-08, "loss": 0.5726, "mean_token_accuracy": 0.8837704807519913, "num_tokens": 449567936.0, "step": 4204 }, { "epoch": 9.579247434435576, "grad_norm": 2.65625, "learning_rate": 2.4504890669045654e-08, "loss": 0.5684, "mean_token_accuracy": 0.8814296871423721, "num_tokens": 449674620.0, "step": 4205 }, { "epoch": 9.581527936145951, "grad_norm": 3.0, "learning_rate": 2.4242531072273255e-08, "loss": 0.5677, "mean_token_accuracy": 0.8843848407268524, "num_tokens": 449782103.0, "step": 4206 }, { "epoch": 9.583808437856328, "grad_norm": 2.484375, "learning_rate": 2.398157666764378e-08, "loss": 0.5678, "mean_token_accuracy": 0.883353129029274, "num_tokens": 449888924.0, "step": 4207 }, { "epoch": 9.586088939566705, "grad_norm": 2.875, "learning_rate": 2.3722027603270415e-08, "loss": 0.5702, "mean_token_accuracy": 0.8861001282930374, "num_tokens": 449996145.0, "step": 4208 }, { "epoch": 9.58836944127708, "grad_norm": 3.109375, "learning_rate": 2.3463884026467265e-08, "loss": 0.585, "mean_token_accuracy": 0.8813659995794296, "num_tokens": 450103843.0, "step": 4209 }, { "epoch": 9.590649942987458, "grad_norm": 2.703125, "learning_rate": 2.320714608375241e-08, "loss": 0.5616, "mean_token_accuracy": 0.8853833377361298, "num_tokens": 450210552.0, "step": 4210 }, { "epoch": 9.592930444697833, "grad_norm": 4.96875, "learning_rate": 2.295181392084511e-08, "loss": 0.5994, "mean_token_accuracy": 0.877122089266777, "num_tokens": 450318286.0, "step": 4211 }, { "epoch": 9.59521094640821, "grad_norm": 3.765625, "learning_rate": 2.269788768266695e-08, "loss": 0.5508, "mean_token_accuracy": 0.8865731358528137, "num_tokens": 450425852.0, "step": 4212 }, { "epoch": 9.597491448118586, "grad_norm": 3.0, "learning_rate": 2.2445367513341533e-08, "loss": 0.5923, "mean_token_accuracy": 0.8766883164644241, "num_tokens": 450532753.0, "step": 4213 }, { "epoch": 9.599771949828963, "grad_norm": 2.953125, "learning_rate": 2.21942535561942e-08, "loss": 0.5868, "mean_token_accuracy": 0.8800808787345886, "num_tokens": 450639888.0, "step": 4214 }, { "epoch": 9.602052451539338, "grad_norm": 3.765625, "learning_rate": 2.1944545953752894e-08, "loss": 0.5824, "mean_token_accuracy": 0.8837506324052811, "num_tokens": 450746949.0, "step": 4215 }, { "epoch": 9.604332953249715, "grad_norm": 3.265625, "learning_rate": 2.1696244847746737e-08, "loss": 0.574, "mean_token_accuracy": 0.8841430097818375, "num_tokens": 450853966.0, "step": 4216 }, { "epoch": 9.60661345496009, "grad_norm": 2.734375, "learning_rate": 2.1449350379106336e-08, "loss": 0.5536, "mean_token_accuracy": 0.8856358528137207, "num_tokens": 450961525.0, "step": 4217 }, { "epoch": 9.608893956670467, "grad_norm": 3.140625, "learning_rate": 2.1203862687964595e-08, "loss": 0.5718, "mean_token_accuracy": 0.8838754594326019, "num_tokens": 451068191.0, "step": 4218 }, { "epoch": 9.611174458380844, "grad_norm": 4.28125, "learning_rate": 2.0959781913655053e-08, "loss": 0.5791, "mean_token_accuracy": 0.8790152668952942, "num_tokens": 451175232.0, "step": 4219 }, { "epoch": 9.61345496009122, "grad_norm": 3.078125, "learning_rate": 2.0717108194713566e-08, "loss": 0.5641, "mean_token_accuracy": 0.8844882100820541, "num_tokens": 451282602.0, "step": 4220 }, { "epoch": 9.615735461801597, "grad_norm": 2.953125, "learning_rate": 2.0475841668877172e-08, "loss": 0.5797, "mean_token_accuracy": 0.8819600045681, "num_tokens": 451389683.0, "step": 4221 }, { "epoch": 9.618015963511972, "grad_norm": 3.078125, "learning_rate": 2.0235982473084115e-08, "loss": 0.5573, "mean_token_accuracy": 0.8830082267522812, "num_tokens": 451496555.0, "step": 4222 }, { "epoch": 9.62029646522235, "grad_norm": 3.140625, "learning_rate": 1.9997530743473548e-08, "loss": 0.5728, "mean_token_accuracy": 0.8831875026226044, "num_tokens": 451603138.0, "step": 4223 }, { "epoch": 9.622576966932725, "grad_norm": 3.28125, "learning_rate": 1.9760486615386376e-08, "loss": 0.5686, "mean_token_accuracy": 0.8828965127468109, "num_tokens": 451709899.0, "step": 4224 }, { "epoch": 9.624857468643102, "grad_norm": 3.078125, "learning_rate": 1.9524850223363868e-08, "loss": 0.5722, "mean_token_accuracy": 0.8803802877664566, "num_tokens": 451817093.0, "step": 4225 }, { "epoch": 9.627137970353477, "grad_norm": 2.90625, "learning_rate": 1.9290621701149315e-08, "loss": 0.5728, "mean_token_accuracy": 0.883967936038971, "num_tokens": 451924424.0, "step": 4226 }, { "epoch": 9.629418472063854, "grad_norm": 3.9375, "learning_rate": 1.905780118168582e-08, "loss": 0.5787, "mean_token_accuracy": 0.8782493621110916, "num_tokens": 452031788.0, "step": 4227 }, { "epoch": 9.631698973774231, "grad_norm": 3.625, "learning_rate": 1.882638879711768e-08, "loss": 0.5915, "mean_token_accuracy": 0.8770202100276947, "num_tokens": 452139174.0, "step": 4228 }, { "epoch": 9.633979475484606, "grad_norm": 2.640625, "learning_rate": 1.859638467879038e-08, "loss": 0.5595, "mean_token_accuracy": 0.8836124539375305, "num_tokens": 452246865.0, "step": 4229 }, { "epoch": 9.636259977194984, "grad_norm": 2.859375, "learning_rate": 1.8367788957250054e-08, "loss": 0.56, "mean_token_accuracy": 0.8824215233325958, "num_tokens": 452354042.0, "step": 4230 }, { "epoch": 9.638540478905359, "grad_norm": 2.765625, "learning_rate": 1.8140601762242916e-08, "loss": 0.5793, "mean_token_accuracy": 0.8815779983997345, "num_tokens": 452461206.0, "step": 4231 }, { "epoch": 9.640820980615736, "grad_norm": 3.625, "learning_rate": 1.7914823222715817e-08, "loss": 0.5824, "mean_token_accuracy": 0.8769263476133347, "num_tokens": 452567894.0, "step": 4232 }, { "epoch": 9.643101482326111, "grad_norm": 2.890625, "learning_rate": 1.7690453466816805e-08, "loss": 0.5695, "mean_token_accuracy": 0.8843720555305481, "num_tokens": 452675400.0, "step": 4233 }, { "epoch": 9.645381984036488, "grad_norm": 3.0625, "learning_rate": 1.7467492621893457e-08, "loss": 0.5697, "mean_token_accuracy": 0.8844276815652847, "num_tokens": 452782870.0, "step": 4234 }, { "epoch": 9.647662485746864, "grad_norm": 3.078125, "learning_rate": 1.724594081449399e-08, "loss": 0.5705, "mean_token_accuracy": 0.8851383030414581, "num_tokens": 452889771.0, "step": 4235 }, { "epoch": 9.64994298745724, "grad_norm": 3.8125, "learning_rate": 1.702579817036726e-08, "loss": 0.5686, "mean_token_accuracy": 0.8865519165992737, "num_tokens": 452997568.0, "step": 4236 }, { "epoch": 9.652223489167618, "grad_norm": 2.96875, "learning_rate": 1.680706481446165e-08, "loss": 0.5671, "mean_token_accuracy": 0.8826638162136078, "num_tokens": 453104638.0, "step": 4237 }, { "epoch": 9.654503990877993, "grad_norm": 3.09375, "learning_rate": 1.6589740870926186e-08, "loss": 0.5501, "mean_token_accuracy": 0.8905289322137833, "num_tokens": 453211884.0, "step": 4238 }, { "epoch": 9.65678449258837, "grad_norm": 2.640625, "learning_rate": 1.6373826463109976e-08, "loss": 0.5844, "mean_token_accuracy": 0.8786414712667465, "num_tokens": 453318930.0, "step": 4239 }, { "epoch": 9.659064994298745, "grad_norm": 2.59375, "learning_rate": 1.6159321713561382e-08, "loss": 0.5901, "mean_token_accuracy": 0.8797651678323746, "num_tokens": 453426505.0, "step": 4240 }, { "epoch": 9.661345496009123, "grad_norm": 3.203125, "learning_rate": 1.5946226744029402e-08, "loss": 0.5725, "mean_token_accuracy": 0.8815406709909439, "num_tokens": 453533705.0, "step": 4241 }, { "epoch": 9.663625997719498, "grad_norm": 4.21875, "learning_rate": 1.5734541675462567e-08, "loss": 0.556, "mean_token_accuracy": 0.8848689198493958, "num_tokens": 453640428.0, "step": 4242 }, { "epoch": 9.665906499429875, "grad_norm": 4.21875, "learning_rate": 1.5524266628009212e-08, "loss": 0.5868, "mean_token_accuracy": 0.8785993754863739, "num_tokens": 453747745.0, "step": 4243 }, { "epoch": 9.66818700114025, "grad_norm": 3.140625, "learning_rate": 1.5315401721017752e-08, "loss": 0.5512, "mean_token_accuracy": 0.8862568438053131, "num_tokens": 453854721.0, "step": 4244 }, { "epoch": 9.670467502850627, "grad_norm": 2.65625, "learning_rate": 1.5107947073035312e-08, "loss": 0.575, "mean_token_accuracy": 0.8809828609228134, "num_tokens": 453962100.0, "step": 4245 }, { "epoch": 9.672748004561003, "grad_norm": 4.5, "learning_rate": 1.4901902801809642e-08, "loss": 0.5666, "mean_token_accuracy": 0.8845654428005219, "num_tokens": 454069089.0, "step": 4246 }, { "epoch": 9.67502850627138, "grad_norm": 2.671875, "learning_rate": 1.4697269024287198e-08, "loss": 0.5652, "mean_token_accuracy": 0.8847362250089645, "num_tokens": 454176352.0, "step": 4247 }, { "epoch": 9.677309007981757, "grad_norm": 3.9375, "learning_rate": 1.4494045856613959e-08, "loss": 0.5736, "mean_token_accuracy": 0.8813314586877823, "num_tokens": 454282952.0, "step": 4248 }, { "epoch": 9.679589509692132, "grad_norm": 3.078125, "learning_rate": 1.4292233414135992e-08, "loss": 0.5811, "mean_token_accuracy": 0.8810845017433167, "num_tokens": 454390017.0, "step": 4249 }, { "epoch": 9.68187001140251, "grad_norm": 3.765625, "learning_rate": 1.4091831811397782e-08, "loss": 0.5771, "mean_token_accuracy": 0.8813609778881073, "num_tokens": 454496557.0, "step": 4250 }, { "epoch": 9.684150513112884, "grad_norm": 3.359375, "learning_rate": 1.38928411621439e-08, "loss": 0.576, "mean_token_accuracy": 0.8821087181568146, "num_tokens": 454603755.0, "step": 4251 }, { "epoch": 9.686431014823262, "grad_norm": 5.5, "learning_rate": 1.3695261579316776e-08, "loss": 0.5867, "mean_token_accuracy": 0.8810128718614578, "num_tokens": 454710416.0, "step": 4252 }, { "epoch": 9.688711516533637, "grad_norm": 2.546875, "learning_rate": 1.3499093175059208e-08, "loss": 0.5492, "mean_token_accuracy": 0.8873603790998459, "num_tokens": 454817321.0, "step": 4253 }, { "epoch": 9.690992018244014, "grad_norm": 2.625, "learning_rate": 1.3304336060712685e-08, "loss": 0.5604, "mean_token_accuracy": 0.8854569047689438, "num_tokens": 454924489.0, "step": 4254 }, { "epoch": 9.69327251995439, "grad_norm": 3.09375, "learning_rate": 1.3110990346817676e-08, "loss": 0.5641, "mean_token_accuracy": 0.8837967067956924, "num_tokens": 455031171.0, "step": 4255 }, { "epoch": 9.695553021664766, "grad_norm": 2.84375, "learning_rate": 1.2919056143113062e-08, "loss": 0.5644, "mean_token_accuracy": 0.8802383691072464, "num_tokens": 455138285.0, "step": 4256 }, { "epoch": 9.697833523375142, "grad_norm": 2.625, "learning_rate": 1.2728533558537259e-08, "loss": 0.5509, "mean_token_accuracy": 0.8880133926868439, "num_tokens": 455245413.0, "step": 4257 }, { "epoch": 9.700114025085519, "grad_norm": 3.390625, "learning_rate": 1.2539422701227099e-08, "loss": 0.5777, "mean_token_accuracy": 0.8805587142705917, "num_tokens": 455352632.0, "step": 4258 }, { "epoch": 9.702394526795896, "grad_norm": 2.921875, "learning_rate": 1.235172367851839e-08, "loss": 0.5548, "mean_token_accuracy": 0.8849736303091049, "num_tokens": 455460206.0, "step": 4259 }, { "epoch": 9.704675028506271, "grad_norm": 5.21875, "learning_rate": 1.2165436596945634e-08, "loss": 0.5487, "mean_token_accuracy": 0.8850021660327911, "num_tokens": 455567328.0, "step": 4260 }, { "epoch": 9.706955530216648, "grad_norm": 3.90625, "learning_rate": 1.19805615622412e-08, "loss": 0.5687, "mean_token_accuracy": 0.8875507861375809, "num_tokens": 455674739.0, "step": 4261 }, { "epoch": 9.709236031927023, "grad_norm": 3.3125, "learning_rate": 1.179709867933726e-08, "loss": 0.5739, "mean_token_accuracy": 0.8814339190721512, "num_tokens": 455782280.0, "step": 4262 }, { "epoch": 9.7115165336374, "grad_norm": 2.9375, "learning_rate": 1.1615048052363298e-08, "loss": 0.5736, "mean_token_accuracy": 0.8808081150054932, "num_tokens": 455888628.0, "step": 4263 }, { "epoch": 9.713797035347776, "grad_norm": 3.25, "learning_rate": 1.1434409784648049e-08, "loss": 0.5585, "mean_token_accuracy": 0.8849108666181564, "num_tokens": 455996087.0, "step": 4264 }, { "epoch": 9.716077537058153, "grad_norm": 2.8125, "learning_rate": 1.125518397871811e-08, "loss": 0.5839, "mean_token_accuracy": 0.880156397819519, "num_tokens": 456103096.0, "step": 4265 }, { "epoch": 9.718358038768528, "grad_norm": 3.1875, "learning_rate": 1.1077370736298498e-08, "loss": 0.575, "mean_token_accuracy": 0.8832675367593765, "num_tokens": 456209824.0, "step": 4266 }, { "epoch": 9.720638540478905, "grad_norm": 2.578125, "learning_rate": 1.090097015831293e-08, "loss": 0.5706, "mean_token_accuracy": 0.8845285028219223, "num_tokens": 456317318.0, "step": 4267 }, { "epoch": 9.722919042189282, "grad_norm": 2.90625, "learning_rate": 1.0725982344882701e-08, "loss": 0.5922, "mean_token_accuracy": 0.8807027041912079, "num_tokens": 456424411.0, "step": 4268 }, { "epoch": 9.725199543899658, "grad_norm": 3.0, "learning_rate": 1.0552407395327813e-08, "loss": 0.5872, "mean_token_accuracy": 0.8796821981668472, "num_tokens": 456531485.0, "step": 4269 }, { "epoch": 9.727480045610035, "grad_norm": 3.84375, "learning_rate": 1.0380245408165846e-08, "loss": 0.5654, "mean_token_accuracy": 0.8859843015670776, "num_tokens": 456638619.0, "step": 4270 }, { "epoch": 9.72976054732041, "grad_norm": 3.296875, "learning_rate": 1.0209496481112247e-08, "loss": 0.5732, "mean_token_accuracy": 0.8768668621778488, "num_tokens": 456745727.0, "step": 4271 }, { "epoch": 9.732041049030787, "grad_norm": 3.125, "learning_rate": 1.0040160711081437e-08, "loss": 0.5719, "mean_token_accuracy": 0.8817434310913086, "num_tokens": 456852873.0, "step": 4272 }, { "epoch": 9.734321550741162, "grad_norm": 2.609375, "learning_rate": 9.87223819418487e-09, "loss": 0.5612, "mean_token_accuracy": 0.8846335262060165, "num_tokens": 456960828.0, "step": 4273 }, { "epoch": 9.73660205245154, "grad_norm": 4.84375, "learning_rate": 9.705729025732135e-09, "loss": 0.5523, "mean_token_accuracy": 0.8860846012830734, "num_tokens": 457067866.0, "step": 4274 }, { "epoch": 9.738882554161915, "grad_norm": 3.90625, "learning_rate": 9.540633300230418e-09, "loss": 0.579, "mean_token_accuracy": 0.8812527358531952, "num_tokens": 457174777.0, "step": 4275 }, { "epoch": 9.741163055872292, "grad_norm": 3.0625, "learning_rate": 9.376951111385313e-09, "loss": 0.5482, "mean_token_accuracy": 0.8858974128961563, "num_tokens": 457282370.0, "step": 4276 }, { "epoch": 9.743443557582669, "grad_norm": 2.765625, "learning_rate": 9.214682552099175e-09, "loss": 0.5803, "mean_token_accuracy": 0.8799700736999512, "num_tokens": 457389651.0, "step": 4277 }, { "epoch": 9.745724059293044, "grad_norm": 2.90625, "learning_rate": 9.053827714472773e-09, "loss": 0.5549, "mean_token_accuracy": 0.8852715194225311, "num_tokens": 457496887.0, "step": 4278 }, { "epoch": 9.748004561003421, "grad_norm": 2.8125, "learning_rate": 8.894386689804469e-09, "loss": 0.5617, "mean_token_accuracy": 0.883760392665863, "num_tokens": 457603521.0, "step": 4279 }, { "epoch": 9.750285062713797, "grad_norm": 3.078125, "learning_rate": 8.73635956858937e-09, "loss": 0.5729, "mean_token_accuracy": 0.8828184455633163, "num_tokens": 457710089.0, "step": 4280 }, { "epoch": 9.752565564424174, "grad_norm": 2.84375, "learning_rate": 8.579746440520731e-09, "loss": 0.5782, "mean_token_accuracy": 0.8836691379547119, "num_tokens": 457817982.0, "step": 4281 }, { "epoch": 9.754846066134549, "grad_norm": 2.65625, "learning_rate": 8.424547394489668e-09, "loss": 0.5624, "mean_token_accuracy": 0.883584663271904, "num_tokens": 457925306.0, "step": 4282 }, { "epoch": 9.757126567844926, "grad_norm": 3.5625, "learning_rate": 8.270762518583498e-09, "loss": 0.5763, "mean_token_accuracy": 0.8828044384717941, "num_tokens": 458032364.0, "step": 4283 }, { "epoch": 9.759407069555301, "grad_norm": 2.921875, "learning_rate": 8.118391900087952e-09, "loss": 0.5401, "mean_token_accuracy": 0.8894132673740387, "num_tokens": 458139150.0, "step": 4284 }, { "epoch": 9.761687571265679, "grad_norm": 3.828125, "learning_rate": 7.967435625485242e-09, "loss": 0.5614, "mean_token_accuracy": 0.8839370012283325, "num_tokens": 458246236.0, "step": 4285 }, { "epoch": 9.763968072976056, "grad_norm": 4.0625, "learning_rate": 7.81789378045572e-09, "loss": 0.555, "mean_token_accuracy": 0.8877554684877396, "num_tokens": 458353277.0, "step": 4286 }, { "epoch": 9.766248574686431, "grad_norm": 2.796875, "learning_rate": 7.669766449876493e-09, "loss": 0.5718, "mean_token_accuracy": 0.8832697570323944, "num_tokens": 458459980.0, "step": 4287 }, { "epoch": 9.768529076396808, "grad_norm": 3.578125, "learning_rate": 7.523053717821138e-09, "loss": 0.57, "mean_token_accuracy": 0.8807340711355209, "num_tokens": 458566519.0, "step": 4288 }, { "epoch": 9.770809578107183, "grad_norm": 3.703125, "learning_rate": 7.377755667561659e-09, "loss": 0.5684, "mean_token_accuracy": 0.8821778148412704, "num_tokens": 458673827.0, "step": 4289 }, { "epoch": 9.77309007981756, "grad_norm": 3.921875, "learning_rate": 7.233872381565976e-09, "loss": 0.5793, "mean_token_accuracy": 0.8782579749822617, "num_tokens": 458780746.0, "step": 4290 }, { "epoch": 9.775370581527936, "grad_norm": 2.9375, "learning_rate": 7.091403941499597e-09, "loss": 0.556, "mean_token_accuracy": 0.885596290230751, "num_tokens": 458887534.0, "step": 4291 }, { "epoch": 9.777651083238313, "grad_norm": 3.0, "learning_rate": 6.950350428225061e-09, "loss": 0.5504, "mean_token_accuracy": 0.8886802643537521, "num_tokens": 458994895.0, "step": 4292 }, { "epoch": 9.779931584948688, "grad_norm": 3.09375, "learning_rate": 6.810711921801105e-09, "loss": 0.5644, "mean_token_accuracy": 0.8828777819871902, "num_tokens": 459102180.0, "step": 4293 }, { "epoch": 9.782212086659065, "grad_norm": 3.34375, "learning_rate": 6.672488501484608e-09, "loss": 0.5629, "mean_token_accuracy": 0.8822075426578522, "num_tokens": 459209744.0, "step": 4294 }, { "epoch": 9.78449258836944, "grad_norm": 3.765625, "learning_rate": 6.535680245727816e-09, "loss": 0.556, "mean_token_accuracy": 0.8854386210441589, "num_tokens": 459317224.0, "step": 4295 }, { "epoch": 9.786773090079818, "grad_norm": 2.828125, "learning_rate": 6.400287232180558e-09, "loss": 0.5752, "mean_token_accuracy": 0.8836027830839157, "num_tokens": 459424913.0, "step": 4296 }, { "epoch": 9.789053591790195, "grad_norm": 2.828125, "learning_rate": 6.266309537689696e-09, "loss": 0.5751, "mean_token_accuracy": 0.8846608698368073, "num_tokens": 459531883.0, "step": 4297 }, { "epoch": 9.79133409350057, "grad_norm": 4.5, "learning_rate": 6.133747238298016e-09, "loss": 0.5916, "mean_token_accuracy": 0.8778994530439377, "num_tokens": 459638404.0, "step": 4298 }, { "epoch": 9.793614595210947, "grad_norm": 2.65625, "learning_rate": 6.002600409245607e-09, "loss": 0.5522, "mean_token_accuracy": 0.8856344819068909, "num_tokens": 459746677.0, "step": 4299 }, { "epoch": 9.795895096921322, "grad_norm": 4.03125, "learning_rate": 5.872869124968761e-09, "loss": 0.5939, "mean_token_accuracy": 0.87481589615345, "num_tokens": 459853457.0, "step": 4300 }, { "epoch": 9.7981755986317, "grad_norm": 2.8125, "learning_rate": 5.7445534591002435e-09, "loss": 0.5763, "mean_token_accuracy": 0.8805107921361923, "num_tokens": 459960464.0, "step": 4301 }, { "epoch": 9.800456100342075, "grad_norm": 2.71875, "learning_rate": 5.617653484469576e-09, "loss": 0.5489, "mean_token_accuracy": 0.885638564825058, "num_tokens": 460067123.0, "step": 4302 }, { "epoch": 9.802736602052452, "grad_norm": 6.28125, "learning_rate": 5.492169273103309e-09, "loss": 0.5632, "mean_token_accuracy": 0.884444460272789, "num_tokens": 460173895.0, "step": 4303 }, { "epoch": 9.805017103762827, "grad_norm": 3.171875, "learning_rate": 5.368100896223083e-09, "loss": 0.5775, "mean_token_accuracy": 0.8823024779558182, "num_tokens": 460280558.0, "step": 4304 }, { "epoch": 9.807297605473204, "grad_norm": 3.640625, "learning_rate": 5.245448424248123e-09, "loss": 0.5707, "mean_token_accuracy": 0.885224923491478, "num_tokens": 460387236.0, "step": 4305 }, { "epoch": 9.80957810718358, "grad_norm": 2.96875, "learning_rate": 5.124211926793577e-09, "loss": 0.5659, "mean_token_accuracy": 0.8848960250616074, "num_tokens": 460493654.0, "step": 4306 }, { "epoch": 9.811858608893957, "grad_norm": 2.796875, "learning_rate": 5.004391472670788e-09, "loss": 0.5544, "mean_token_accuracy": 0.8836629986763, "num_tokens": 460600975.0, "step": 4307 }, { "epoch": 9.814139110604334, "grad_norm": 3.53125, "learning_rate": 4.885987129887859e-09, "loss": 0.5956, "mean_token_accuracy": 0.8799479305744171, "num_tokens": 460707729.0, "step": 4308 }, { "epoch": 9.816419612314709, "grad_norm": 2.6875, "learning_rate": 4.768998965648253e-09, "loss": 0.591, "mean_token_accuracy": 0.8796339482069016, "num_tokens": 460814870.0, "step": 4309 }, { "epoch": 9.818700114025086, "grad_norm": 3.15625, "learning_rate": 4.653427046352743e-09, "loss": 0.5804, "mean_token_accuracy": 0.8791462630033493, "num_tokens": 460921892.0, "step": 4310 }, { "epoch": 9.820980615735461, "grad_norm": 3.09375, "learning_rate": 4.53927143759747e-09, "loss": 0.5772, "mean_token_accuracy": 0.8801241964101791, "num_tokens": 461029101.0, "step": 4311 }, { "epoch": 9.823261117445838, "grad_norm": 2.484375, "learning_rate": 4.426532204175049e-09, "loss": 0.5596, "mean_token_accuracy": 0.8861949443817139, "num_tokens": 461136681.0, "step": 4312 }, { "epoch": 9.825541619156214, "grad_norm": 2.703125, "learning_rate": 4.3152094100740175e-09, "loss": 0.5629, "mean_token_accuracy": 0.8844578862190247, "num_tokens": 461244074.0, "step": 4313 }, { "epoch": 9.82782212086659, "grad_norm": 2.890625, "learning_rate": 4.205303118479109e-09, "loss": 0.5693, "mean_token_accuracy": 0.8827894330024719, "num_tokens": 461351000.0, "step": 4314 }, { "epoch": 9.830102622576966, "grad_norm": 2.71875, "learning_rate": 4.096813391770982e-09, "loss": 0.5606, "mean_token_accuracy": 0.8869377076625824, "num_tokens": 461458568.0, "step": 4315 }, { "epoch": 9.832383124287343, "grad_norm": 3.234375, "learning_rate": 3.989740291526212e-09, "loss": 0.5671, "mean_token_accuracy": 0.8829131424427032, "num_tokens": 461565463.0, "step": 4316 }, { "epoch": 9.83466362599772, "grad_norm": 3.046875, "learning_rate": 3.884083878517575e-09, "loss": 0.5668, "mean_token_accuracy": 0.8803281784057617, "num_tokens": 461672563.0, "step": 4317 }, { "epoch": 9.836944127708096, "grad_norm": 3.71875, "learning_rate": 3.779844212713213e-09, "loss": 0.5542, "mean_token_accuracy": 0.8861279636621475, "num_tokens": 461779451.0, "step": 4318 }, { "epoch": 9.839224629418473, "grad_norm": 2.734375, "learning_rate": 3.6770213532782985e-09, "loss": 0.5639, "mean_token_accuracy": 0.8874040246009827, "num_tokens": 461886457.0, "step": 4319 }, { "epoch": 9.841505131128848, "grad_norm": 3.5, "learning_rate": 3.5756153585725374e-09, "loss": 0.5524, "mean_token_accuracy": 0.8833809942007065, "num_tokens": 461993587.0, "step": 4320 }, { "epoch": 9.843785632839225, "grad_norm": 3.03125, "learning_rate": 3.475626286152112e-09, "loss": 0.5725, "mean_token_accuracy": 0.8821329474449158, "num_tokens": 462100785.0, "step": 4321 }, { "epoch": 9.8460661345496, "grad_norm": 2.65625, "learning_rate": 3.3770541927691247e-09, "loss": 0.5641, "mean_token_accuracy": 0.8800762295722961, "num_tokens": 462207714.0, "step": 4322 }, { "epoch": 9.848346636259977, "grad_norm": 2.671875, "learning_rate": 3.2798991343707676e-09, "loss": 0.5559, "mean_token_accuracy": 0.8831606060266495, "num_tokens": 462314885.0, "step": 4323 }, { "epoch": 9.850627137970353, "grad_norm": 3.921875, "learning_rate": 3.1841611661007077e-09, "loss": 0.5608, "mean_token_accuracy": 0.8826912045478821, "num_tokens": 462422224.0, "step": 4324 }, { "epoch": 9.85290763968073, "grad_norm": 3.1875, "learning_rate": 3.089840342297701e-09, "loss": 0.5771, "mean_token_accuracy": 0.8812664598226547, "num_tokens": 462529435.0, "step": 4325 }, { "epoch": 9.855188141391107, "grad_norm": 3.75, "learning_rate": 2.9969367164969787e-09, "loss": 0.5616, "mean_token_accuracy": 0.8857505470514297, "num_tokens": 462636769.0, "step": 4326 }, { "epoch": 9.857468643101482, "grad_norm": 3.484375, "learning_rate": 2.905450341428029e-09, "loss": 0.5718, "mean_token_accuracy": 0.8818775713443756, "num_tokens": 462743978.0, "step": 4327 }, { "epoch": 9.85974914481186, "grad_norm": 3.421875, "learning_rate": 2.8153812690173697e-09, "loss": 0.5767, "mean_token_accuracy": 0.8814905136823654, "num_tokens": 462850538.0, "step": 4328 }, { "epoch": 9.862029646522235, "grad_norm": 2.625, "learning_rate": 2.726729550386331e-09, "loss": 0.5719, "mean_token_accuracy": 0.8831851780414581, "num_tokens": 462957437.0, "step": 4329 }, { "epoch": 9.864310148232612, "grad_norm": 3.15625, "learning_rate": 2.6394952358518854e-09, "loss": 0.5804, "mean_token_accuracy": 0.8794573098421097, "num_tokens": 463063901.0, "step": 4330 }, { "epoch": 9.866590649942987, "grad_norm": 2.875, "learning_rate": 2.553678374926649e-09, "loss": 0.5617, "mean_token_accuracy": 0.8824156671762466, "num_tokens": 463171455.0, "step": 4331 }, { "epoch": 9.868871151653364, "grad_norm": 3.3125, "learning_rate": 2.4692790163183268e-09, "loss": 0.5688, "mean_token_accuracy": 0.8811941742897034, "num_tokens": 463278442.0, "step": 4332 }, { "epoch": 9.87115165336374, "grad_norm": 3.796875, "learning_rate": 2.3862972079305435e-09, "loss": 0.5559, "mean_token_accuracy": 0.8832034319639206, "num_tokens": 463385401.0, "step": 4333 }, { "epoch": 9.873432155074116, "grad_norm": 3.90625, "learning_rate": 2.3047329968620137e-09, "loss": 0.5627, "mean_token_accuracy": 0.8862831592559814, "num_tokens": 463492423.0, "step": 4334 }, { "epoch": 9.875712656784494, "grad_norm": 3.0, "learning_rate": 2.2245864294073715e-09, "loss": 0.5696, "mean_token_accuracy": 0.8842545598745346, "num_tokens": 463599395.0, "step": 4335 }, { "epoch": 9.877993158494869, "grad_norm": 2.875, "learning_rate": 2.145857551056063e-09, "loss": 0.5805, "mean_token_accuracy": 0.8817007839679718, "num_tokens": 463707615.0, "step": 4336 }, { "epoch": 9.880273660205246, "grad_norm": 2.796875, "learning_rate": 2.0685464064928996e-09, "loss": 0.5538, "mean_token_accuracy": 0.8855222314596176, "num_tokens": 463814300.0, "step": 4337 }, { "epoch": 9.882554161915621, "grad_norm": 3.03125, "learning_rate": 1.992653039598613e-09, "loss": 0.5621, "mean_token_accuracy": 0.8810195475816727, "num_tokens": 463921566.0, "step": 4338 }, { "epoch": 9.884834663625998, "grad_norm": 3.234375, "learning_rate": 1.91817749344847e-09, "loss": 0.5934, "mean_token_accuracy": 0.8760579079389572, "num_tokens": 464028254.0, "step": 4339 }, { "epoch": 9.887115165336374, "grad_norm": 2.5, "learning_rate": 1.8451198103133783e-09, "loss": 0.5684, "mean_token_accuracy": 0.8798963725566864, "num_tokens": 464135330.0, "step": 4340 }, { "epoch": 9.88939566704675, "grad_norm": 4.78125, "learning_rate": 1.7734800316596135e-09, "loss": 0.577, "mean_token_accuracy": 0.8792663961648941, "num_tokens": 464242517.0, "step": 4341 }, { "epoch": 9.891676168757126, "grad_norm": 2.953125, "learning_rate": 1.703258198148261e-09, "loss": 0.5634, "mean_token_accuracy": 0.884123831987381, "num_tokens": 464349209.0, "step": 4342 }, { "epoch": 9.893956670467503, "grad_norm": 3.703125, "learning_rate": 1.6344543496360499e-09, "loss": 0.5629, "mean_token_accuracy": 0.8826566338539124, "num_tokens": 464456526.0, "step": 4343 }, { "epoch": 9.896237172177878, "grad_norm": 2.765625, "learning_rate": 1.567068525175075e-09, "loss": 0.5671, "mean_token_accuracy": 0.887358620762825, "num_tokens": 464563578.0, "step": 4344 }, { "epoch": 9.898517673888255, "grad_norm": 3.625, "learning_rate": 1.5011007630114093e-09, "loss": 0.5738, "mean_token_accuracy": 0.8806861042976379, "num_tokens": 464670481.0, "step": 4345 }, { "epoch": 9.900798175598633, "grad_norm": 2.8125, "learning_rate": 1.4365511005878796e-09, "loss": 0.5689, "mean_token_accuracy": 0.8842573165893555, "num_tokens": 464778377.0, "step": 4346 }, { "epoch": 9.903078677309008, "grad_norm": 3.796875, "learning_rate": 1.3734195745412904e-09, "loss": 0.5739, "mean_token_accuracy": 0.8824024051427841, "num_tokens": 464885701.0, "step": 4347 }, { "epoch": 9.905359179019385, "grad_norm": 3.625, "learning_rate": 1.3117062207038123e-09, "loss": 0.577, "mean_token_accuracy": 0.880613699555397, "num_tokens": 464992795.0, "step": 4348 }, { "epoch": 9.90763968072976, "grad_norm": 3.53125, "learning_rate": 1.2514110741029816e-09, "loss": 0.5741, "mean_token_accuracy": 0.8834188729524612, "num_tokens": 465099950.0, "step": 4349 }, { "epoch": 9.909920182440137, "grad_norm": 2.734375, "learning_rate": 1.1925341689608682e-09, "loss": 0.5755, "mean_token_accuracy": 0.8817101418972015, "num_tokens": 465207553.0, "step": 4350 }, { "epoch": 9.912200684150513, "grad_norm": 3.59375, "learning_rate": 1.135075538695185e-09, "loss": 0.5977, "mean_token_accuracy": 0.8769432902336121, "num_tokens": 465314057.0, "step": 4351 }, { "epoch": 9.91448118586089, "grad_norm": 3.875, "learning_rate": 1.0790352159179007e-09, "loss": 0.5812, "mean_token_accuracy": 0.8812268972396851, "num_tokens": 465420891.0, "step": 4352 }, { "epoch": 9.916761687571265, "grad_norm": 3.6875, "learning_rate": 1.024413232436905e-09, "loss": 0.5762, "mean_token_accuracy": 0.878851979970932, "num_tokens": 465527864.0, "step": 4353 }, { "epoch": 9.919042189281642, "grad_norm": 3.09375, "learning_rate": 9.71209619254343e-10, "loss": 0.5747, "mean_token_accuracy": 0.8795945793390274, "num_tokens": 465634366.0, "step": 4354 }, { "epoch": 9.921322690992017, "grad_norm": 3.390625, "learning_rate": 9.194244065674484e-10, "loss": 0.5829, "mean_token_accuracy": 0.8763747960329056, "num_tokens": 465741337.0, "step": 4355 }, { "epoch": 9.923603192702394, "grad_norm": 3.0, "learning_rate": 8.690576237688208e-10, "loss": 0.573, "mean_token_accuracy": 0.8846324980258942, "num_tokens": 465848967.0, "step": 4356 }, { "epoch": 9.925883694412772, "grad_norm": 2.609375, "learning_rate": 8.201092994453153e-10, "loss": 0.5799, "mean_token_accuracy": 0.879865899682045, "num_tokens": 465955628.0, "step": 4357 }, { "epoch": 9.928164196123147, "grad_norm": 2.953125, "learning_rate": 7.725794613791527e-10, "loss": 0.5724, "mean_token_accuracy": 0.8813536763191223, "num_tokens": 466062877.0, "step": 4358 }, { "epoch": 9.930444697833524, "grad_norm": 3.75, "learning_rate": 7.264681365476422e-10, "loss": 0.5683, "mean_token_accuracy": 0.8843436688184738, "num_tokens": 466170580.0, "step": 4359 }, { "epoch": 9.9327251995439, "grad_norm": 3.125, "learning_rate": 6.817753511226266e-10, "loss": 0.5662, "mean_token_accuracy": 0.8817441612482071, "num_tokens": 466277744.0, "step": 4360 }, { "epoch": 9.935005701254276, "grad_norm": 2.375, "learning_rate": 6.385011304704814e-10, "loss": 0.5443, "mean_token_accuracy": 0.8887474536895752, "num_tokens": 466384776.0, "step": 4361 }, { "epoch": 9.937286202964652, "grad_norm": 2.875, "learning_rate": 5.96645499152948e-10, "loss": 0.5935, "mean_token_accuracy": 0.8781139552593231, "num_tokens": 466491782.0, "step": 4362 }, { "epoch": 9.939566704675029, "grad_norm": 2.78125, "learning_rate": 5.562084809268564e-10, "loss": 0.5561, "mean_token_accuracy": 0.8866285234689713, "num_tokens": 466598743.0, "step": 4363 }, { "epoch": 9.941847206385404, "grad_norm": 2.6875, "learning_rate": 5.171900987430146e-10, "loss": 0.5625, "mean_token_accuracy": 0.8828070014715195, "num_tokens": 466705736.0, "step": 4364 }, { "epoch": 9.944127708095781, "grad_norm": 2.78125, "learning_rate": 4.795903747475961e-10, "loss": 0.5786, "mean_token_accuracy": 0.8803279101848602, "num_tokens": 466812755.0, "step": 4365 }, { "epoch": 9.946408209806158, "grad_norm": 2.734375, "learning_rate": 4.434093302815856e-10, "loss": 0.5639, "mean_token_accuracy": 0.8847870975732803, "num_tokens": 466919779.0, "step": 4366 }, { "epoch": 9.948688711516533, "grad_norm": 2.75, "learning_rate": 4.0864698588077844e-10, "loss": 0.5774, "mean_token_accuracy": 0.883985847234726, "num_tokens": 467026369.0, "step": 4367 }, { "epoch": 9.95096921322691, "grad_norm": 3.09375, "learning_rate": 3.7530336127550306e-10, "loss": 0.6001, "mean_token_accuracy": 0.8794097006320953, "num_tokens": 467133105.0, "step": 4368 }, { "epoch": 9.953249714937286, "grad_norm": 3.21875, "learning_rate": 3.4337847539089866e-10, "loss": 0.5515, "mean_token_accuracy": 0.8836051672697067, "num_tokens": 467240349.0, "step": 4369 }, { "epoch": 9.955530216647663, "grad_norm": 3.125, "learning_rate": 3.1287234634663766e-10, "loss": 0.5822, "mean_token_accuracy": 0.8793405294418335, "num_tokens": 467347282.0, "step": 4370 }, { "epoch": 9.957810718358038, "grad_norm": 2.71875, "learning_rate": 2.8378499145803593e-10, "loss": 0.5576, "mean_token_accuracy": 0.8886418640613556, "num_tokens": 467454418.0, "step": 4371 }, { "epoch": 9.960091220068415, "grad_norm": 4.1875, "learning_rate": 2.5611642723410987e-10, "loss": 0.5857, "mean_token_accuracy": 0.8817699551582336, "num_tokens": 467561394.0, "step": 4372 }, { "epoch": 9.96237172177879, "grad_norm": 3.890625, "learning_rate": 2.2986666937896418e-10, "loss": 0.5689, "mean_token_accuracy": 0.8804190456867218, "num_tokens": 467668306.0, "step": 4373 }, { "epoch": 9.964652223489168, "grad_norm": 2.875, "learning_rate": 2.050357327917918e-10, "loss": 0.5541, "mean_token_accuracy": 0.8814870566129684, "num_tokens": 467775667.0, "step": 4374 }, { "epoch": 9.966932725199545, "grad_norm": 2.515625, "learning_rate": 1.816236315657638e-10, "loss": 0.5719, "mean_token_accuracy": 0.8799229115247726, "num_tokens": 467883505.0, "step": 4375 }, { "epoch": 9.96921322690992, "grad_norm": 2.875, "learning_rate": 1.5963037898913957e-10, "loss": 0.5626, "mean_token_accuracy": 0.8831915408372879, "num_tokens": 467990987.0, "step": 4376 }, { "epoch": 9.971493728620297, "grad_norm": 2.796875, "learning_rate": 1.3905598754526684e-10, "loss": 0.5844, "mean_token_accuracy": 0.8801503032445908, "num_tokens": 468098922.0, "step": 4377 }, { "epoch": 9.973774230330672, "grad_norm": 2.859375, "learning_rate": 1.1990046891147133e-10, "loss": 0.5554, "mean_token_accuracy": 0.8852853924036026, "num_tokens": 468206794.0, "step": 4378 }, { "epoch": 9.97605473204105, "grad_norm": 2.625, "learning_rate": 1.021638339598896e-10, "loss": 0.5712, "mean_token_accuracy": 0.8826959729194641, "num_tokens": 468313770.0, "step": 4379 }, { "epoch": 9.978335233751425, "grad_norm": 3.421875, "learning_rate": 8.584609275802402e-11, "loss": 0.5628, "mean_token_accuracy": 0.8853756189346313, "num_tokens": 468420750.0, "step": 4380 }, { "epoch": 9.980615735461802, "grad_norm": 2.921875, "learning_rate": 7.094725456707752e-11, "loss": 0.5398, "mean_token_accuracy": 0.8900392800569534, "num_tokens": 468527796.0, "step": 4381 }, { "epoch": 9.982896237172177, "grad_norm": 3.21875, "learning_rate": 5.746732784361886e-11, "loss": 0.5661, "mean_token_accuracy": 0.8818590939044952, "num_tokens": 468634501.0, "step": 4382 }, { "epoch": 9.985176738882554, "grad_norm": 2.921875, "learning_rate": 4.540632023819491e-11, "loss": 0.5673, "mean_token_accuracy": 0.8829274624586105, "num_tokens": 468741744.0, "step": 4383 }, { "epoch": 9.987457240592931, "grad_norm": 3.09375, "learning_rate": 3.47642385967184e-11, "loss": 0.5709, "mean_token_accuracy": 0.8815714865922928, "num_tokens": 468848526.0, "step": 4384 }, { "epoch": 9.989737742303307, "grad_norm": 2.84375, "learning_rate": 2.5541088959357697e-11, "loss": 0.5437, "mean_token_accuracy": 0.8854826241731644, "num_tokens": 468955714.0, "step": 4385 }, { "epoch": 9.992018244013684, "grad_norm": 2.90625, "learning_rate": 1.773687656109191e-11, "loss": 0.5839, "mean_token_accuracy": 0.8776491433382034, "num_tokens": 469062395.0, "step": 4386 }, { "epoch": 9.994298745724059, "grad_norm": 3.671875, "learning_rate": 1.1351605831433354e-11, "loss": 0.5663, "mean_token_accuracy": 0.8828050196170807, "num_tokens": 469169221.0, "step": 4387 }, { "epoch": 9.996579247434436, "grad_norm": 4.84375, "learning_rate": 6.385280394149984e-12, "loss": 0.5735, "mean_token_accuracy": 0.8813241869211197, "num_tokens": 469276076.0, "step": 4388 }, { "epoch": 9.998859749144811, "grad_norm": 4.5625, "learning_rate": 2.8379030686531696e-12, "loss": 0.5724, "mean_token_accuracy": 0.8789683431386948, "num_tokens": 469383376.0, "step": 4389 }, { "epoch": 10.0, "grad_norm": 4.90625, "learning_rate": 7.094758677772539e-13, "loss": 0.578, "mean_token_accuracy": 0.8766284584999084, "num_tokens": 469422320.0, "step": 4390 } ], "logging_steps": 1, "max_steps": 4390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.252059023052636e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }