diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14543 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17362995116657623, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2888.0, + "completions/mean_length": 1069.0982666015625, + "completions/mean_terminated_length": 861.9014892578125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.00017362995116657623, + "grad_norm": 0.08312451094388962, + "kl": 7.200241088867188e-05, + "learning_rate": 0.0, + "loss": 0.1019, + "num_tokens": 543828.0, + "reward": 0.2187500149011612, + "reward_std": 0.26587459444999695, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4138607978820801, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00034725990233315247, + "grad_norm": 0.08317255973815918, + "kl": 7.200241088867188e-05, + "learning_rate": 1e-08, + "loss": 0.1019, + "step": 2 + }, + { + "clip_ratio/high_max": 0.00017734717948769685, + "clip_ratio/high_mean": 3.614835213738843e-05, + "clip_ratio/low_mean": 6.03649980348564e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.65133513091132e-05, + "epoch": 0.0005208898534997288, + "grad_norm": 0.09187809377908707, + "kl": 7.516145706176758e-05, + "learning_rate": 2e-08, + "loss": 0.1019, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0002011762153415475, + "clip_ratio/high_mean": 3.874928006553091e-05, + "clip_ratio/low_mean": 5.499333383340854e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.374261480843415e-05, + "epoch": 0.0006945198046663049, + "grad_norm": 0.08208971470594406, + "kl": 7.733702659606934e-05, + "learning_rate": 3e-08, + "loss": 0.1019, + "step": 4 + }, + { + "clip_ratio/high_max": 0.00018187825298809912, + "clip_ratio/high_mean": 3.383311673132994e-05, + "clip_ratio/low_mean": 6.573261293851829e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.956573057934293e-05, + "epoch": 0.0008681497558328812, + "grad_norm": 0.08130073547363281, + "kl": 7.62939453125e-05, + "learning_rate": 4e-08, + "loss": 0.102, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0002339957318326924, + "clip_ratio/high_mean": 3.8574269524360716e-05, + "clip_ratio/low_mean": 5.737374135605933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.594801019829902e-05, + "epoch": 0.0010417797069994575, + "grad_norm": 0.07988259941339493, + "kl": 8.347630500793457e-05, + "learning_rate": 5e-08, + "loss": 0.102, + "step": 6 + }, + { + "clip_ratio/high_max": 0.00019622506715677446, + "clip_ratio/high_mean": 4.668732128720876e-05, + "clip_ratio/low_mean": 5.498728080510773e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010167460220600333, + "epoch": 0.0012154096581660336, + "grad_norm": 0.07902826368808746, + "kl": 8.025765419006348e-05, + "learning_rate": 6e-08, + "loss": 0.1019, + "step": 7 + }, + { + "clip_ratio/high_max": 0.00014846220437902957, + "clip_ratio/high_mean": 3.300488288004999e-05, + "clip_ratio/low_mean": 5.860795056378265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.161283082903537e-05, + "epoch": 0.0013890396093326099, + "grad_norm": 0.09225912392139435, + "kl": 7.525086402893066e-05, + "learning_rate": 7e-08, + "loss": 0.1019, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0758928571428571, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3056.0, + "completions/mean_length": 1106.6429443359375, + "completions/mean_terminated_length": 945.2366943359375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0015626695604991862, + "grad_norm": 0.07411890476942062, + "kl": 7.322430610656738e-05, + "learning_rate": 8e-08, + "loss": 0.0741, + "num_tokens": 1104492.0, + "reward": 0.1674107164144516, + "reward_std": 0.1820857971906662, + "rewards/accuracy_reward/mean": 0.1674107164144516, + "rewards/accuracy_reward/std": 0.37375950813293457, + "step": 9 + }, + { + "clip_ratio/high_max": 9.704411877464736e-05, + "clip_ratio/high_mean": 1.3863445815331943e-05, + "clip_ratio/low_mean": 4.3595246893346484e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.745869407292048e-05, + "epoch": 0.0017362995116657625, + "grad_norm": 0.07438937574625015, + "kl": 7.450580596923828e-05, + "learning_rate": 9e-08, + "loss": 0.0742, + "step": 10 + }, + { + "clip_ratio/high_max": 8.37466150187538e-05, + "clip_ratio/high_mean": 1.2991255744054797e-05, + "clip_ratio/low_mean": 4.0086545254780503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.307780259045103e-05, + "epoch": 0.0019099294628323385, + "grad_norm": 0.07481148838996887, + "kl": 7.724761962890625e-05, + "learning_rate": 1e-07, + "loss": 0.0742, + "step": 11 + }, + { + "clip_ratio/high_max": 0.00011983493641309906, + "clip_ratio/high_mean": 1.989127281376568e-05, + "clip_ratio/low_mean": 4.786969634551497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.776096938665432e-05, + "epoch": 0.002083559413998915, + "grad_norm": 0.07376047968864441, + "kl": 0.0001112222671508789, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.0742, + "step": 12 + }, + { + "clip_ratio/high_max": 0.00014891848786646733, + "clip_ratio/high_mean": 2.2301523358692066e-05, + "clip_ratio/low_mean": 4.656390899526741e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.886543269501999e-05, + "epoch": 0.002257189365165491, + "grad_norm": 0.07342521846294403, + "kl": 7.43567943572998e-05, + "learning_rate": 1.2e-07, + "loss": 0.0742, + "step": 13 + }, + { + "clip_ratio/high_max": 5.2887537094647996e-05, + "clip_ratio/high_mean": 1.058451312019315e-05, + "clip_ratio/low_mean": 4.9786707904786454e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.037122079760593e-05, + "epoch": 0.002430819316332067, + "grad_norm": 0.07298363000154495, + "kl": 8.45789909362793e-05, + "learning_rate": 1.3e-07, + "loss": 0.0742, + "step": 14 + }, + { + "clip_ratio/high_max": 0.00012552260341180954, + "clip_ratio/high_mean": 1.9142536984873004e-05, + "clip_ratio/low_mean": 3.941370846405334e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.855624488049216e-05, + "epoch": 0.0026044492674986435, + "grad_norm": 0.07264551520347595, + "kl": 7.745623588562012e-05, + "learning_rate": 1.4e-07, + "loss": 0.0742, + "step": 15 + }, + { + "clip_ratio/high_max": 9.212246914103162e-05, + "clip_ratio/high_mean": 1.3160352864360902e-05, + "clip_ratio/low_mean": 4.3187843061787135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6348194448219147e-05, + "epoch": 0.0027780792186652198, + "grad_norm": 0.07343482226133347, + "kl": 0.00010889768600463867, + "learning_rate": 1.5e-07, + "loss": 0.0742, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0848214285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3039.0, + "completions/mean_length": 1027.546875, + "completions/mean_terminated_length": 838.0609741210938, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.002951709169831796, + "grad_norm": 0.07811467349529266, + "kl": 7.998943328857422e-05, + "learning_rate": 1.6e-07, + "loss": 0.0773, + "num_tokens": 1633745.0, + "reward": 0.1763392984867096, + "reward_std": 0.2238721251487732, + "rewards/accuracy_reward/mean": 0.1763392835855484, + "rewards/accuracy_reward/std": 0.3815346360206604, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0001340598719252739, + "clip_ratio/high_mean": 2.269723245262867e-05, + "clip_ratio/low_mean": 4.6339094751601806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.90363260673621e-05, + "epoch": 0.0031253391209983723, + "grad_norm": 0.07763178646564484, + "kl": 7.75456428527832e-05, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.0773, + "step": 18 + }, + { + "clip_ratio/high_max": 9.851857248577289e-05, + "clip_ratio/high_mean": 2.10864341170236e-05, + "clip_ratio/low_mean": 4.406144989843597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.514788219647016e-05, + "epoch": 0.0032989690721649486, + "grad_norm": 0.07750116288661957, + "kl": 8.14199447631836e-05, + "learning_rate": 1.8e-07, + "loss": 0.0773, + "step": 19 + }, + { + "clip_ratio/high_max": 0.00021771680439997, + "clip_ratio/high_mean": 3.593632732190599e-05, + "clip_ratio/low_mean": 7.785161528772733e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011378794010852289, + "epoch": 0.003472599023331525, + "grad_norm": 0.07713920623064041, + "kl": 8.374452590942383e-05, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.0772, + "step": 20 + }, + { + "clip_ratio/high_max": 0.00013868681571693742, + "clip_ratio/high_mean": 2.3358224211733614e-05, + "clip_ratio/low_mean": 4.4567103941517416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.792532974486676e-05, + "epoch": 0.0036462289744981008, + "grad_norm": 0.07797636836767197, + "kl": 8.046627044677734e-05, + "learning_rate": 2e-07, + "loss": 0.0773, + "step": 21 + }, + { + "clip_ratio/high_max": 0.00013905832020100206, + "clip_ratio/high_mean": 2.2133624156595033e-05, + "clip_ratio/low_mean": 3.620566849349416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.833929367327073e-05, + "epoch": 0.003819858925664677, + "grad_norm": 0.07895684242248535, + "kl": 7.939338684082031e-05, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.0773, + "step": 22 + }, + { + "clip_ratio/high_max": 0.00015697673734393902, + "clip_ratio/high_mean": 2.6972006480718846e-05, + "clip_ratio/low_mean": 5.847412899129267e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.54461356993852e-05, + "epoch": 0.003993488876831253, + "grad_norm": 0.07751636952161789, + "kl": 8.183717727661133e-05, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0773, + "step": 23 + }, + { + "clip_ratio/high_max": 0.00022616104979533702, + "clip_ratio/high_mean": 4.1390749629499624e-05, + "clip_ratio/low_mean": 5.6943259323816164e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.833400918068946e-05, + "epoch": 0.00416711882799783, + "grad_norm": 0.07773281633853912, + "kl": 8.618831634521484e-05, + "learning_rate": 2.3e-07, + "loss": 0.0773, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0758928571428571, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3018.0, + "completions/mean_length": 1000.9777221679688, + "completions/mean_terminated_length": 830.8937377929688, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.004340748779164406, + "grad_norm": 0.09035438299179077, + "kl": 7.954239845275879e-05, + "learning_rate": 2.4e-07, + "loss": 0.0559, + "num_tokens": 2144215.0, + "reward": 0.2075892984867096, + "reward_std": 0.2196648269891739, + "rewards/accuracy_reward/mean": 0.2075892835855484, + "rewards/accuracy_reward/std": 0.4060344398021698, + "step": 25 + }, + { + "clip_ratio/high_max": 0.00016966628209047485, + "clip_ratio/high_mean": 4.3388792391851894e-05, + "clip_ratio/low_mean": 5.315348585099855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.654227915234515e-05, + "epoch": 0.004514378730330982, + "grad_norm": 0.08993355929851532, + "kl": 7.805228233337402e-05, + "learning_rate": 2.5e-07, + "loss": 0.0559, + "step": 26 + }, + { + "clip_ratio/high_max": 0.00014771419046155643, + "clip_ratio/high_mean": 2.7782731876868638e-05, + "clip_ratio/low_mean": 6.608906437577389e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.387179579789517e-05, + "epoch": 0.0046880086814975585, + "grad_norm": 0.08998360484838486, + "kl": 7.49826431274414e-05, + "learning_rate": 2.6e-07, + "loss": 0.0559, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0002342458137718495, + "clip_ratio/high_mean": 4.849543029195047e-05, + "clip_ratio/low_mean": 9.095161726691003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001394470457398711, + "epoch": 0.004861638632664134, + "grad_norm": 0.08851566910743713, + "kl": 8.386373519897461e-05, + "learning_rate": 2.7e-07, + "loss": 0.0559, + "step": 28 + }, + { + "clip_ratio/high_max": 0.00023134674120228738, + "clip_ratio/high_mean": 5.22928294230951e-05, + "clip_ratio/low_mean": 8.602222465015075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013831504952577234, + "epoch": 0.005035268583830711, + "grad_norm": 0.08929405361413956, + "kl": 8.207559585571289e-05, + "learning_rate": 2.8e-07, + "loss": 0.0559, + "step": 29 + }, + { + "clip_ratio/high_max": 0.00022253452789300354, + "clip_ratio/high_mean": 5.1063938826700905e-05, + "clip_ratio/low_mean": 0.00011547929466360074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001665432312165649, + "epoch": 0.005208898534997287, + "grad_norm": 0.08835883438587189, + "kl": 7.921457290649414e-05, + "learning_rate": 2.9e-07, + "loss": 0.0559, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0002099981602441403, + "clip_ratio/high_mean": 3.942374564758211e-05, + "clip_ratio/low_mean": 0.00011220073656659224, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015162447925831657, + "epoch": 0.005382528486163864, + "grad_norm": 0.08897814899682999, + "kl": 7.662177085876465e-05, + "learning_rate": 3e-07, + "loss": 0.0559, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0001918534326250665, + "clip_ratio/high_mean": 4.845873968406522e-05, + "clip_ratio/low_mean": 0.00012192821054668457, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017038694886650774, + "epoch": 0.0055561584373304395, + "grad_norm": 0.08902352303266525, + "kl": 7.456541061401367e-05, + "learning_rate": 3.1e-07, + "loss": 0.0559, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1026785714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3070.0, + "completions/mean_length": 1071.555908203125, + "completions/mean_terminated_length": 842.6492309570312, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.005729788388497015, + "grad_norm": 0.0930178314447403, + "kl": 8.696317672729492e-05, + "learning_rate": 3.2e-07, + "loss": 0.1121, + "num_tokens": 2693064.0, + "reward": 0.1785714328289032, + "reward_std": 0.2662632167339325, + "rewards/accuracy_reward/mean": 0.1785714328289032, + "rewards/accuracy_reward/std": 0.3834212124347687, + "step": 33 + }, + { + "clip_ratio/high_max": 6.574029430339579e-05, + "clip_ratio/high_mean": 1.168525886896532e-05, + "clip_ratio/low_mean": 0.00010515081294215634, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011683607067425328, + "epoch": 0.005903418339663592, + "grad_norm": 0.0920194536447525, + "kl": 8.732080459594727e-05, + "learning_rate": 3.3e-07, + "loss": 0.1122, + "step": 34 + }, + { + "clip_ratio/high_max": 8.627499573776731e-05, + "clip_ratio/high_mean": 2.075476299978618e-05, + "clip_ratio/low_mean": 8.375979768970865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010451455796101072, + "epoch": 0.006077048290830168, + "grad_norm": 0.09262243658304214, + "kl": 8.809566497802734e-05, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.1122, + "step": 35 + }, + { + "clip_ratio/high_max": 8.649766004964476e-05, + "clip_ratio/high_mean": 1.6442359765278525e-05, + "clip_ratio/low_mean": 9.817714681048528e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011461950930424791, + "epoch": 0.006250678241996745, + "grad_norm": 0.09249166399240494, + "kl": 9.393692016601562e-05, + "learning_rate": 3.5e-07, + "loss": 0.1122, + "step": 36 + }, + { + "clip_ratio/high_max": 8.238965892815031e-05, + "clip_ratio/high_mean": 2.05791129701538e-05, + "clip_ratio/low_mean": 0.00010433309148538683, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012491220536503533, + "epoch": 0.0064243081931633205, + "grad_norm": 0.09264138340950012, + "kl": 8.857250213623047e-05, + "learning_rate": 3.6e-07, + "loss": 0.1121, + "step": 37 + }, + { + "clip_ratio/high_max": 0.00010841518542292761, + "clip_ratio/high_mean": 2.3967947527125943e-05, + "clip_ratio/low_mean": 0.00011336427724017994, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013733222294831648, + "epoch": 0.006597938144329897, + "grad_norm": 0.09265647083520889, + "kl": 9.21487808227539e-05, + "learning_rate": 3.7e-07, + "loss": 0.1121, + "step": 38 + }, + { + "clip_ratio/high_max": 6.219745228008833e-05, + "clip_ratio/high_mean": 1.2619579706552031e-05, + "clip_ratio/low_mean": 7.132467067094694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.394424799007538e-05, + "epoch": 0.006771568095496473, + "grad_norm": 0.09281119704246521, + "kl": 9.66191291809082e-05, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.1122, + "step": 39 + }, + { + "clip_ratio/high_max": 0.00010844510052265832, + "clip_ratio/high_mean": 2.2843767737867893e-05, + "clip_ratio/low_mean": 0.00013229608794063097, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015513985545112519, + "epoch": 0.00694519804666305, + "grad_norm": 0.09060566127300262, + "kl": 9.85264778137207e-05, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.1121, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3056.0, + "completions/mean_length": 988.8906860351562, + "completions/mean_terminated_length": 865.775390625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.007118827997829626, + "grad_norm": 0.09165029227733612, + "kl": 8.952617645263672e-05, + "learning_rate": 4e-07, + "loss": 0.085, + "num_tokens": 3200567.0, + "reward": 0.1897321492433548, + "reward_std": 0.23183387517929077, + "rewards/accuracy_reward/mean": 0.1897321492433548, + "rewards/accuracy_reward/std": 0.39252743124961853, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0001679516826698091, + "clip_ratio/high_mean": 2.899227456509834e-05, + "clip_ratio/low_mean": 8.33833173601306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001123755914704816, + "epoch": 0.0072924579489962015, + "grad_norm": 0.08812303096055984, + "kl": 9.34600830078125e-05, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.085, + "step": 42 + }, + { + "clip_ratio/high_max": 0.00021431623463286087, + "clip_ratio/high_mean": 4.141974716276309e-05, + "clip_ratio/low_mean": 7.455421678059793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011597396405704785, + "epoch": 0.007466087900162778, + "grad_norm": 0.09308629482984543, + "kl": 0.00010097026824951172, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.085, + "step": 43 + }, + { + "clip_ratio/high_max": 0.00013697694157599472, + "clip_ratio/high_mean": 2.4210043761740963e-05, + "clip_ratio/low_mean": 7.815628828211629e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001023663330670388, + "epoch": 0.007639717851329354, + "grad_norm": 0.0892244353890419, + "kl": 0.00010156631469726562, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.085, + "step": 44 + }, + { + "clip_ratio/high_max": 0.00020106123156438116, + "clip_ratio/high_mean": 3.764704842978972e-05, + "clip_ratio/low_mean": 0.0001562574616400525, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019390450324863195, + "epoch": 0.00781334780249593, + "grad_norm": 0.08605607599020004, + "kl": 0.00010067224502563477, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.085, + "step": 45 + }, + { + "clip_ratio/high_max": 0.00021489420214493293, + "clip_ratio/high_mean": 4.421249855113274e-05, + "clip_ratio/low_mean": 0.00010841752259693749, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001526300195564545, + "epoch": 0.007986977753662507, + "grad_norm": 0.0870320051908493, + "kl": 0.0001035928726196289, + "learning_rate": 4.5e-07, + "loss": 0.085, + "step": 46 + }, + { + "clip_ratio/high_max": 0.00031488635067944415, + "clip_ratio/high_mean": 7.150872943384456e-05, + "clip_ratio/low_mean": 0.00011320743965370639, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018471616658644052, + "epoch": 0.008160607704829083, + "grad_norm": 0.08356923609972, + "kl": 0.00011730194091796875, + "learning_rate": 4.6e-07, + "loss": 0.0849, + "step": 47 + }, + { + "clip_ratio/high_max": 0.00023286564282898325, + "clip_ratio/high_mean": 5.216112549533136e-05, + "clip_ratio/low_mean": 0.00014855244535283418, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020071356902917614, + "epoch": 0.00833423765599566, + "grad_norm": 0.08161472529172897, + "kl": 0.00012886524200439453, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.0849, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3002.0, + "completions/mean_length": 1160.8929443359375, + "completions/mean_terminated_length": 998.9346923828125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.008507867607162236, + "grad_norm": 0.0810905322432518, + "kl": 0.00011199712753295898, + "learning_rate": 4.8e-07, + "loss": 0.1006, + "num_tokens": 3786527.0, + "reward": 0.212053582072258, + "reward_std": 0.2776791751384735, + "rewards/accuracy_reward/mean": 0.2120535671710968, + "rewards/accuracy_reward/std": 0.40921956300735474, + "step": 49 + }, + { + "clip_ratio/high_max": 0.00012155798049207078, + "clip_ratio/high_mean": 2.4444943960588716e-05, + "clip_ratio/low_mean": 7.255377909132221e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.699872271085042e-05, + "epoch": 0.008681497558328812, + "grad_norm": 0.0808875784277916, + "kl": 0.00011438131332397461, + "learning_rate": 4.9e-07, + "loss": 0.1006, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0001246258707396919, + "clip_ratio/high_mean": 2.6213942874164786e-05, + "clip_ratio/low_mean": 5.216798012952495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.838192368581076e-05, + "epoch": 0.008855127509495388, + "grad_norm": 0.08084183931350708, + "kl": 0.00011897087097167969, + "learning_rate": 5e-07, + "loss": 0.1006, + "step": 51 + }, + { + "clip_ratio/high_max": 0.00010440920277687837, + "clip_ratio/high_mean": 1.9211074516078952e-05, + "clip_ratio/low_mean": 5.505485529511134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.42659295269732e-05, + "epoch": 0.009028757460661964, + "grad_norm": 0.08065138012170792, + "kl": 0.00012415647506713867, + "learning_rate": 5.1e-07, + "loss": 0.1006, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0001526883379483479, + "clip_ratio/high_mean": 3.393346742086578e-05, + "clip_ratio/low_mean": 6.860658731966396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010254005451315606, + "epoch": 0.009202387411828541, + "grad_norm": 0.08006803691387177, + "kl": 0.00014036893844604492, + "learning_rate": 5.2e-07, + "loss": 0.1006, + "step": 53 + }, + { + "clip_ratio/high_max": 0.00015472432642127387, + "clip_ratio/high_mean": 2.8822314334320254e-05, + "clip_ratio/low_mean": 9.972897760235355e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012855129034505808, + "epoch": 0.009376017362995117, + "grad_norm": 0.07948467135429382, + "kl": 0.00014001131057739258, + "learning_rate": 5.3e-07, + "loss": 0.1005, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0001281892527913442, + "clip_ratio/high_mean": 2.8609633318410488e-05, + "clip_ratio/low_mean": 0.00011305644147796556, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014166607843435486, + "epoch": 0.009549647314161693, + "grad_norm": 0.07863836735486984, + "kl": 0.0001538395881652832, + "learning_rate": 5.4e-07, + "loss": 0.1005, + "step": 55 + }, + { + "clip_ratio/high_max": 0.00021591029508272186, + "clip_ratio/high_mean": 4.273020863365673e-05, + "clip_ratio/low_mean": 0.00012304980691624223, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016578001577727264, + "epoch": 0.009723277265328269, + "grad_norm": 0.07811674475669861, + "kl": 0.00016063451766967773, + "learning_rate": 5.5e-07, + "loss": 0.1005, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0647321428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3030.0, + "completions/mean_length": 1078.890625, + "completions/mean_terminated_length": 940.9427490234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.009896907216494845, + "grad_norm": 0.05809413641691208, + "kl": 0.00015920400619506836, + "learning_rate": 5.6e-07, + "loss": 0.0966, + "num_tokens": 4337990.0, + "reward": 0.1919642984867096, + "reward_std": 0.2389710694551468, + "rewards/accuracy_reward/mean": 0.1919642835855484, + "rewards/accuracy_reward/std": 0.3942854404449463, + "step": 57 + }, + { + "clip_ratio/high_max": 9.883356233331142e-05, + "clip_ratio/high_mean": 1.7071649949684797e-05, + "clip_ratio/low_mean": 8.026342766243033e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.733507886267034e-05, + "epoch": 0.010070537167661422, + "grad_norm": 0.058291442692279816, + "kl": 0.0001792311668395996, + "learning_rate": 5.699999999999999e-07, + "loss": 0.0967, + "step": 58 + }, + { + "clip_ratio/high_max": 7.553357681899797e-05, + "clip_ratio/high_mean": 1.4598224424844375e-05, + "clip_ratio/low_mean": 6.950293573027011e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.410116151935654e-05, + "epoch": 0.010244167118827998, + "grad_norm": 0.05770552530884743, + "kl": 0.00018918514251708984, + "learning_rate": 5.8e-07, + "loss": 0.0966, + "step": 59 + }, + { + "clip_ratio/high_max": 0.00012035238160024164, + "clip_ratio/high_mean": 3.2588331578153884e-05, + "clip_ratio/low_mean": 0.00010148188289349491, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013407021515376982, + "epoch": 0.010417797069994574, + "grad_norm": 0.057692401111125946, + "kl": 0.0002110004425048828, + "learning_rate": 5.9e-07, + "loss": 0.0966, + "step": 60 + }, + { + "clip_ratio/high_max": 0.00012550407063827151, + "clip_ratio/high_mean": 2.3239101210492663e-05, + "clip_ratio/low_mean": 0.00012950542713952018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015274452835001284, + "epoch": 0.01059142702116115, + "grad_norm": 0.057331617921590805, + "kl": 0.0002313852310180664, + "learning_rate": 6e-07, + "loss": 0.0966, + "step": 61 + }, + { + "clip_ratio/high_max": 0.00016726318881410407, + "clip_ratio/high_mean": 3.171255480083346e-05, + "clip_ratio/low_mean": 0.0001368561224808218, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016856867387105012, + "epoch": 0.010765056972327727, + "grad_norm": 0.056989286094903946, + "kl": 0.00024771690368652344, + "learning_rate": 6.1e-07, + "loss": 0.0966, + "step": 62 + }, + { + "clip_ratio/high_max": 0.00013737804783886531, + "clip_ratio/high_mean": 2.2813339683125378e-05, + "clip_ratio/low_mean": 0.00010997585422956035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001327891914115753, + "epoch": 0.010938686923494303, + "grad_norm": 0.057343486696481705, + "kl": 0.00024831295013427734, + "learning_rate": 6.2e-07, + "loss": 0.0965, + "step": 63 + }, + { + "clip_ratio/high_max": 0.00016584300738031743, + "clip_ratio/high_mean": 2.9249637009343132e-05, + "clip_ratio/low_mean": 0.0001424719978331268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001717216332508542, + "epoch": 0.011112316874660879, + "grad_norm": 0.05638548359274864, + "kl": 0.0002677440643310547, + "learning_rate": 6.3e-07, + "loss": 0.0965, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3057.0, + "completions/mean_length": 1200.466552734375, + "completions/mean_terminated_length": 996.6361083984375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.011285946825827455, + "grad_norm": 0.06586487591266632, + "kl": 0.0002557039260864258, + "learning_rate": 6.4e-07, + "loss": 0.1014, + "num_tokens": 4943351.0, + "reward": 0.2165178656578064, + "reward_std": 0.25919023156166077, + "rewards/accuracy_reward/mean": 0.2165178507566452, + "rewards/accuracy_reward/std": 0.41233164072036743, + "step": 65 + }, + { + "clip_ratio/high_max": 9.257514739147155e-05, + "clip_ratio/high_mean": 1.5136920751501748e-05, + "clip_ratio/low_mean": 9.249962863577821e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010763654859147209, + "epoch": 0.01145957677699403, + "grad_norm": 0.06532612442970276, + "kl": 0.0002568960189819336, + "learning_rate": 6.5e-07, + "loss": 0.1014, + "step": 66 + }, + { + "clip_ratio/high_max": 0.00014168794223223813, + "clip_ratio/high_mean": 3.67118568647129e-05, + "clip_ratio/low_mean": 0.00010102744954565424, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013773930140814628, + "epoch": 0.011633206728160608, + "grad_norm": 0.06464871764183044, + "kl": 0.00027680397033691406, + "learning_rate": 6.6e-07, + "loss": 0.1014, + "step": 67 + }, + { + "clip_ratio/high_max": 0.00016270207197521813, + "clip_ratio/high_mean": 2.535767532663158e-05, + "clip_ratio/low_mean": 0.0001051789729444863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001305366477026837, + "epoch": 0.011806836679327184, + "grad_norm": 0.06376749277114868, + "kl": 0.0002999305725097656, + "learning_rate": 6.7e-07, + "loss": 0.1013, + "step": 68 + }, + { + "clip_ratio/high_max": 0.00011891062604263425, + "clip_ratio/high_mean": 3.1178230642581184e-05, + "clip_ratio/low_mean": 0.00010918855605268618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001403667915838014, + "epoch": 0.01198046663049376, + "grad_norm": 0.062105316668748856, + "kl": 0.00032901763916015625, + "learning_rate": 6.800000000000001e-07, + "loss": 0.1013, + "step": 69 + }, + { + "clip_ratio/high_max": 0.00014659844509878894, + "clip_ratio/high_mean": 3.453203146364103e-05, + "clip_ratio/low_mean": 0.0001557424211569014, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001902744552353397, + "epoch": 0.012154096581660336, + "grad_norm": 0.06183410808444023, + "kl": 0.00034737586975097656, + "learning_rate": 6.9e-07, + "loss": 0.1013, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0001618939486434101, + "clip_ratio/high_mean": 3.591397171476274e-05, + "clip_ratio/low_mean": 0.00016991257416520966, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020582654906320386, + "epoch": 0.012327726532826913, + "grad_norm": 0.060535185039043427, + "kl": 0.0003695487976074219, + "learning_rate": 7e-07, + "loss": 0.1012, + "step": 71 + }, + { + "clip_ratio/high_max": 0.00020508543730102247, + "clip_ratio/high_mean": 5.265289792077965e-05, + "clip_ratio/low_mean": 0.0001940940292115556, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002467469275870826, + "epoch": 0.01250135648399349, + "grad_norm": 0.0596056692302227, + "kl": 0.00040793418884277344, + "learning_rate": 7.1e-07, + "loss": 0.1011, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3045.0, + "completions/mean_length": 1005.2188110351562, + "completions/mean_terminated_length": 830.0678100585938, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.012674986435160065, + "grad_norm": 0.09431299567222595, + "kl": 0.000469207763671875, + "learning_rate": 7.2e-07, + "loss": 0.2052, + "num_tokens": 5454785.0, + "reward": 0.3169642984867096, + "reward_std": 0.3128453195095062, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "step": 73 + }, + { + "clip_ratio/high_max": 0.00018865706988435704, + "clip_ratio/high_mean": 5.247045248779614e-05, + "clip_ratio/low_mean": 8.627016336504312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001387406196045049, + "epoch": 0.012848616386326641, + "grad_norm": 0.09711380302906036, + "kl": 0.0004971027374267578, + "learning_rate": 7.3e-07, + "loss": 0.2053, + "step": 74 + }, + { + "clip_ratio/high_max": 0.00026415193678985815, + "clip_ratio/high_mean": 4.625358667453838e-05, + "clip_ratio/low_mean": 0.000107605147150025, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015385873530249228, + "epoch": 0.013022246337493217, + "grad_norm": 0.09356379508972168, + "kl": 0.0005331039428710938, + "learning_rate": 7.4e-07, + "loss": 0.2052, + "step": 75 + }, + { + "clip_ratio/high_max": 0.00018644466399564408, + "clip_ratio/high_mean": 4.887306351974985e-05, + "clip_ratio/low_mean": 0.00011954909859923646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016842216427903622, + "epoch": 0.013195876288659794, + "grad_norm": 0.09154535830020905, + "kl": 0.0005588531494140625, + "learning_rate": 7.5e-07, + "loss": 0.2051, + "step": 76 + }, + { + "clip_ratio/high_max": 0.00020755413788720034, + "clip_ratio/high_mean": 4.1972878989327e-05, + "clip_ratio/low_mean": 0.00014174781313158746, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018372068689131993, + "epoch": 0.01336950623982637, + "grad_norm": 0.09051401168107986, + "kl": 0.0005941390991210938, + "learning_rate": 7.599999999999999e-07, + "loss": 0.2051, + "step": 77 + }, + { + "clip_ratio/high_max": 0.00034200187019450823, + "clip_ratio/high_mean": 5.432748537259613e-05, + "clip_ratio/low_mean": 0.00023175117985374527, + "clip_ratio/low_min": 1.703229281702079e-05, + "clip_ratio/region_mean": 0.00028607866261154413, + "epoch": 0.013543136190992946, + "grad_norm": 0.08831170946359634, + "kl": 0.0006444454193115234, + "learning_rate": 7.699999999999999e-07, + "loss": 0.2049, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0003072102390433429, + "clip_ratio/high_mean": 7.849929306757986e-05, + "clip_ratio/low_mean": 0.00034069092816935154, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00041919022260117345, + "epoch": 0.013716766142159522, + "grad_norm": 0.0850556269288063, + "kl": 0.0007207393646240234, + "learning_rate": 7.799999999999999e-07, + "loss": 0.2048, + "step": 79 + }, + { + "clip_ratio/high_max": 0.00048199150296568405, + "clip_ratio/high_mean": 0.00010863949273698381, + "clip_ratio/low_mean": 0.000415855715800717, + "clip_ratio/low_min": 3.406458563404158e-05, + "clip_ratio/region_mean": 0.0005244952099019429, + "epoch": 0.0138903960933261, + "grad_norm": 0.08350726962089539, + "kl": 0.0007863044738769531, + "learning_rate": 7.9e-07, + "loss": 0.2047, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060267857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3066.0, + "completions/mean_length": 1026.169677734375, + "completions/mean_terminated_length": 894.9644165039062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.014064026044492675, + "grad_norm": 0.11045031994581223, + "kl": 0.0007188320159912109, + "learning_rate": 8e-07, + "loss": 0.0618, + "num_tokens": 5982949.0, + "reward": 0.2388392984867096, + "reward_std": 0.2520594596862793, + "rewards/accuracy_reward/mean": 0.2388392835855484, + "rewards/accuracy_reward/std": 0.4268510043621063, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0002242754899270949, + "clip_ratio/high_mean": 3.7077297179166635e-05, + "clip_ratio/low_mean": 9.089440675325022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012797170325029583, + "epoch": 0.014237655995659251, + "grad_norm": 0.11083557456731796, + "kl": 0.0007548332214355469, + "learning_rate": 8.1e-07, + "loss": 0.0618, + "step": 82 + }, + { + "clip_ratio/high_max": 0.00020692071393568767, + "clip_ratio/high_mean": 3.56615225882706e-05, + "clip_ratio/low_mean": 7.765734721942863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011331886753396248, + "epoch": 0.014411285946825827, + "grad_norm": 0.11028597503900528, + "kl": 0.0007700920104980469, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0617, + "step": 83 + }, + { + "clip_ratio/high_max": 0.00021996179202687927, + "clip_ratio/high_mean": 4.556706562652835e-05, + "clip_ratio/low_mean": 0.00014453896699251345, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019010603398328385, + "epoch": 0.014584915897992403, + "grad_norm": 0.10600043088197708, + "kl": 0.0008153915405273438, + "learning_rate": 8.299999999999999e-07, + "loss": 0.0617, + "step": 84 + }, + { + "clip_ratio/high_max": 0.00017493029190518428, + "clip_ratio/high_mean": 3.109146177848743e-05, + "clip_ratio/low_mean": 0.00015599238474806043, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018708384857291094, + "epoch": 0.01475854584915898, + "grad_norm": 0.1012059897184372, + "kl": 0.0008490085601806641, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0615, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0002562393874541158, + "clip_ratio/high_mean": 3.7630897622875636e-05, + "clip_ratio/low_mean": 0.00022896333712196792, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026659423656383296, + "epoch": 0.014932175800325556, + "grad_norm": 0.09839516878128052, + "kl": 0.0009174346923828125, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0614, + "step": 86 + }, + { + "clip_ratio/high_max": 0.00045952131404192187, + "clip_ratio/high_mean": 0.00010341303345740016, + "clip_ratio/low_mean": 0.000319124579164054, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004225376196700381, + "epoch": 0.015105805751492132, + "grad_norm": 0.09919780492782593, + "kl": 0.0009403228759765625, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0613, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0005570015837292885, + "clip_ratio/high_mean": 0.00013585392844106536, + "clip_ratio/low_mean": 0.0003930496450266219, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005289035789246554, + "epoch": 0.015279435702658708, + "grad_norm": 0.08871521055698395, + "kl": 0.00098419189453125, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0612, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3041.0, + "completions/mean_length": 973.2678833007812, + "completions/mean_terminated_length": 859.6893920898438, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.015453065653825284, + "grad_norm": 0.07832150161266327, + "kl": 0.0008444786071777344, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0415, + "num_tokens": 6484357.0, + "reward": 0.2790178656578064, + "reward_std": 0.2440057247877121, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "step": 89 + }, + { + "clip_ratio/high_max": 0.00016099204549391288, + "clip_ratio/high_mean": 2.73356216666798e-05, + "clip_ratio/low_mean": 7.892878966231365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010626441053318558, + "epoch": 0.01562669560499186, + "grad_norm": 0.07688305526971817, + "kl": 0.0008554458618164062, + "learning_rate": 8.9e-07, + "loss": 0.0414, + "step": 90 + }, + { + "clip_ratio/high_max": 0.00017042357285390608, + "clip_ratio/high_mean": 2.867850150778395e-05, + "clip_ratio/low_mean": 0.00011265330908827309, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014133181025499653, + "epoch": 0.015800325556158436, + "grad_norm": 0.07443048059940338, + "kl": 0.0008955001831054688, + "learning_rate": 9e-07, + "loss": 0.0414, + "step": 91 + }, + { + "clip_ratio/high_max": 0.00012658219111472135, + "clip_ratio/high_mean": 2.5285736001023906e-05, + "clip_ratio/low_mean": 8.549421204406826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011077994668085012, + "epoch": 0.015973955507325013, + "grad_norm": 0.07477273792028427, + "kl": 0.00095367431640625, + "learning_rate": 9.1e-07, + "loss": 0.0413, + "step": 92 + }, + { + "clip_ratio/high_max": 0.00020998206491640303, + "clip_ratio/high_mean": 4.2077007947227685e-05, + "clip_ratio/low_mean": 0.0001513093657194986, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019338637139298953, + "epoch": 0.01614758545849159, + "grad_norm": 0.07124733179807663, + "kl": 0.0009908676147460938, + "learning_rate": 9.2e-07, + "loss": 0.0413, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0003180483063260908, + "clip_ratio/high_mean": 6.089267048992042e-05, + "clip_ratio/low_mean": 0.000301444641991111, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003623373095251736, + "epoch": 0.016321215409658165, + "grad_norm": 0.06833954155445099, + "kl": 0.001064300537109375, + "learning_rate": 9.3e-07, + "loss": 0.0412, + "step": 94 + }, + { + "clip_ratio/high_max": 0.000373542799934512, + "clip_ratio/high_mean": 7.861555286581279e-05, + "clip_ratio/low_mean": 0.00033876666111609666, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004173822171651409, + "epoch": 0.016494845360824743, + "grad_norm": 0.06791101396083832, + "kl": 0.0011701583862304688, + "learning_rate": 9.399999999999999e-07, + "loss": 0.041, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0005620219963020645, + "clip_ratio/high_mean": 0.00011367950537533034, + "clip_ratio/low_mean": 0.0004376019815026666, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005512814832400181, + "epoch": 0.01666847531199132, + "grad_norm": 0.06597509980201721, + "kl": 0.0012426376342773438, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0409, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0424107142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2996.0, + "completions/mean_length": 996.1563110351562, + "completions/mean_terminated_length": 904.2191162109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.016842105263157894, + "grad_norm": 0.08962814509868622, + "kl": 0.0008907318115234375, + "learning_rate": 9.6e-07, + "loss": 0.1002, + "num_tokens": 6996099.0, + "reward": 0.3035714328289032, + "reward_std": 0.28586265444755554, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "step": 97 + }, + { + "clip_ratio/high_max": 8.884687940735603e-05, + "clip_ratio/high_mean": 2.605929273613583e-05, + "clip_ratio/low_mean": 7.139072999962082e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.745002171257511e-05, + "epoch": 0.017015735214324472, + "grad_norm": 0.08770177513360977, + "kl": 0.0009493827819824219, + "learning_rate": 9.7e-07, + "loss": 0.1002, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0001762988013069844, + "clip_ratio/high_mean": 4.0132708363671554e-05, + "clip_ratio/low_mean": 8.831965647004836e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012845236415159889, + "epoch": 0.017189365165491046, + "grad_norm": 0.08555649220943451, + "kl": 0.0010132789611816406, + "learning_rate": 9.8e-07, + "loss": 0.1001, + "step": 99 + }, + { + "clip_ratio/high_max": 0.00037393095044535585, + "clip_ratio/high_mean": 6.405019064459339e-05, + "clip_ratio/low_mean": 0.00014098743713475415, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002050376247098029, + "epoch": 0.017362995116657624, + "grad_norm": 0.08166724443435669, + "kl": 0.0011091232299804688, + "learning_rate": 9.9e-07, + "loss": 0.1, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0004312006349209696, + "clip_ratio/high_mean": 8.29151215384627e-05, + "clip_ratio/low_mean": 0.00021879333689867053, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030170845275279135, + "epoch": 0.0175366250678242, + "grad_norm": 0.07911830395460129, + "kl": 0.0011663436889648438, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 101 + }, + { + "clip_ratio/high_max": 0.00042637546539481264, + "clip_ratio/high_mean": 8.882865267878515e-05, + "clip_ratio/low_mean": 0.00035785686850431375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004466855389182456, + "epoch": 0.017710255018990775, + "grad_norm": 0.07597984373569489, + "kl": 0.0012717247009277344, + "learning_rate": 1e-06, + "loss": 0.0997, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0006530468817800283, + "clip_ratio/high_mean": 0.00013048992650510627, + "clip_ratio/low_mean": 0.00046826389188936446, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005987538261251757, + "epoch": 0.017883884970157353, + "grad_norm": 0.07416495680809021, + "kl": 0.0013484954833984375, + "learning_rate": 1e-06, + "loss": 0.0995, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0009700720911496319, + "clip_ratio/high_mean": 0.0001942694098033826, + "clip_ratio/low_mean": 0.0005604173165920656, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007546867382188793, + "epoch": 0.018057514921323927, + "grad_norm": 0.07335416227579117, + "kl": 0.0014243125915527344, + "learning_rate": 1e-06, + "loss": 0.0994, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0647321428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2937.0, + "completions/mean_length": 1022.5647583007812, + "completions/mean_terminated_length": 880.7183837890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.018231144872490505, + "grad_norm": 0.07201837748289108, + "kl": 0.0012178421020507812, + "learning_rate": 1e-06, + "loss": 0.0805, + "num_tokens": 7519240.0, + "reward": 0.2165178656578064, + "reward_std": 0.21725556254386902, + "rewards/accuracy_reward/mean": 0.2165178507566452, + "rewards/accuracy_reward/std": 0.41233164072036743, + "step": 105 + }, + { + "clip_ratio/high_max": 0.00014526121867675101, + "clip_ratio/high_mean": 2.213181198840175e-05, + "clip_ratio/low_mean": 6.364144383042003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.577325434089289e-05, + "epoch": 0.018404774823657082, + "grad_norm": 0.07121127098798752, + "kl": 0.0012769699096679688, + "learning_rate": 1e-06, + "loss": 0.0805, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0003467846117928275, + "clip_ratio/high_mean": 5.9322157085262006e-05, + "clip_ratio/low_mean": 9.673506110630115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015605721637257375, + "epoch": 0.018578404774823656, + "grad_norm": 0.07026924937963486, + "kl": 0.0012917518615722656, + "learning_rate": 1e-06, + "loss": 0.0805, + "step": 107 + }, + { + "clip_ratio/high_max": 0.00023550596233690158, + "clip_ratio/high_mean": 4.1465860476819216e-05, + "clip_ratio/low_mean": 0.00012672945786107448, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016819531992950942, + "epoch": 0.018752034725990234, + "grad_norm": 0.06839488446712494, + "kl": 0.0013484954833984375, + "learning_rate": 1e-06, + "loss": 0.0804, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0004035711845062906, + "clip_ratio/high_mean": 6.820505836913071e-05, + "clip_ratio/low_mean": 0.0001611022416909691, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022930730119696818, + "epoch": 0.018925664677156808, + "grad_norm": 0.06637325137853622, + "kl": 0.0014042854309082031, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0005445892875286518, + "clip_ratio/high_mean": 9.60605432283046e-05, + "clip_ratio/low_mean": 0.00023658055852138205, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033264110334130237, + "epoch": 0.019099294628323386, + "grad_norm": 0.06226283311843872, + "kl": 0.0014386177062988281, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0006908353989274474, + "clip_ratio/high_mean": 0.0001292448923777556, + "clip_ratio/low_mean": 0.0003256560162299138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00045490090906241676, + "epoch": 0.019272924579489963, + "grad_norm": 0.0611107274889946, + "kl": 0.0014801025390625, + "learning_rate": 1e-06, + "loss": 0.08, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0011556729550648015, + "clip_ratio/high_mean": 0.00022815814600107842, + "clip_ratio/low_mean": 0.00032356871270167176, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005517268618859816, + "epoch": 0.019446554530656537, + "grad_norm": 0.060339849442243576, + "kl": 0.00157928466796875, + "learning_rate": 1e-06, + "loss": 0.0799, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3036.0, + "completions/mean_length": 979.0045166015625, + "completions/mean_terminated_length": 876.0702514648438, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.019620184481823115, + "grad_norm": 0.06964851170778275, + "kl": 0.0015101432800292969, + "learning_rate": 1e-06, + "loss": 0.0826, + "num_tokens": 8028714.0, + "reward": 0.314732164144516, + "reward_std": 0.24220770597457886, + "rewards/accuracy_reward/mean": 0.3147321343421936, + "rewards/accuracy_reward/std": 0.4649282693862915, + "step": 113 + }, + { + "clip_ratio/high_max": 0.00024118073088175151, + "clip_ratio/high_mean": 4.5313062400964554e-05, + "clip_ratio/low_mean": 7.009086084508453e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011540392324604909, + "epoch": 0.01979381443298969, + "grad_norm": 0.06875589489936829, + "kl": 0.0015625953674316406, + "learning_rate": 1e-06, + "loss": 0.0826, + "step": 114 + }, + { + "clip_ratio/high_max": 0.00013809604388370644, + "clip_ratio/high_mean": 3.406273185646569e-05, + "clip_ratio/low_mean": 8.048790186876431e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011455063258836162, + "epoch": 0.019967444384156267, + "grad_norm": 0.06772466003894806, + "kl": 0.0015835762023925781, + "learning_rate": 1e-06, + "loss": 0.0825, + "step": 115 + }, + { + "clip_ratio/high_max": 0.00022875043578096665, + "clip_ratio/high_mean": 4.0153966892830795e-05, + "clip_ratio/low_mean": 7.630263394275971e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011645660606518504, + "epoch": 0.020141074335322844, + "grad_norm": 0.06836879998445511, + "kl": 0.00160980224609375, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 116 + }, + { + "clip_ratio/high_max": 0.00036874830766464584, + "clip_ratio/high_mean": 7.70110639223276e-05, + "clip_ratio/low_mean": 0.0001127591285694507, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001897701895359205, + "epoch": 0.02031470428648942, + "grad_norm": 0.06553922593593597, + "kl": 0.0016446113586425781, + "learning_rate": 1e-06, + "loss": 0.0822, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0004669185072998516, + "clip_ratio/high_mean": 0.0001029003185522015, + "clip_ratio/low_mean": 0.0001852769314609759, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002881772416003514, + "epoch": 0.020488334237655996, + "grad_norm": 0.06398870795965195, + "kl": 0.0019135475158691406, + "learning_rate": 1e-06, + "loss": 0.0821, + "step": 118 + }, + { + "clip_ratio/high_max": 0.00044479290045273956, + "clip_ratio/high_mean": 0.00011731415941085288, + "clip_ratio/low_mean": 0.00027997055531159276, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00039728472438582685, + "epoch": 0.020661964188822574, + "grad_norm": 0.060857128351926804, + "kl": 0.0019588470458984375, + "learning_rate": 1e-06, + "loss": 0.0819, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0006475476484411047, + "clip_ratio/high_mean": 0.00016795206431652332, + "clip_ratio/low_mean": 0.00039172447122837184, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000559676525881514, + "epoch": 0.020835594139989148, + "grad_norm": 0.06337802857160568, + "kl": 0.006945610046386719, + "learning_rate": 1e-06, + "loss": 0.0818, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0647321428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2931.0, + "completions/mean_length": 1012.05810546875, + "completions/mean_terminated_length": 869.4844970703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.021009224091155725, + "grad_norm": 0.06343921273946762, + "kl": 0.0016217231750488281, + "learning_rate": 1e-06, + "loss": 0.0576, + "num_tokens": 8547940.0, + "reward": 0.2857142984867096, + "reward_std": 0.24281248450279236, + "rewards/accuracy_reward/mean": 0.2857142984867096, + "rewards/accuracy_reward/std": 0.45225897431373596, + "step": 121 + }, + { + "clip_ratio/high_max": 0.00015984401761670597, + "clip_ratio/high_mean": 3.2233886486210395e-05, + "clip_ratio/low_mean": 4.042998898512451e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.266387228810345e-05, + "epoch": 0.0211828540423223, + "grad_norm": 0.06328827887773514, + "kl": 0.0016565322875976562, + "learning_rate": 1e-06, + "loss": 0.0576, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0002079317318930407, + "clip_ratio/high_mean": 4.6169100187398726e-05, + "clip_ratio/low_mean": 4.9363021275894425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.553212214541418e-05, + "epoch": 0.021356483993488877, + "grad_norm": 0.06310537457466125, + "kl": 0.0016760826110839844, + "learning_rate": 1e-06, + "loss": 0.0576, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0001335408705926966, + "clip_ratio/high_mean": 2.3682008304604096e-05, + "clip_ratio/low_mean": 5.922453442508413e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.290654091069882e-05, + "epoch": 0.021530113944655455, + "grad_norm": 0.06295251846313477, + "kl": 0.001682281494140625, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 124 + }, + { + "clip_ratio/high_max": 0.000468465772428317, + "clip_ratio/high_mean": 8.70894049285198e-05, + "clip_ratio/low_mean": 9.746788873599144e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018455729423294542, + "epoch": 0.02170374389582203, + "grad_norm": 0.06160371005535126, + "kl": 0.0017123222351074219, + "learning_rate": 1e-06, + "loss": 0.0574, + "step": 125 + }, + { + "clip_ratio/high_max": 0.00033351476668030955, + "clip_ratio/high_mean": 6.599840207854868e-05, + "clip_ratio/low_mean": 0.00015436868443430285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022036708742234623, + "epoch": 0.021877373846988606, + "grad_norm": 0.06023632735013962, + "kl": 0.001766204833984375, + "learning_rate": 1e-06, + "loss": 0.0572, + "step": 126 + }, + { + "clip_ratio/high_max": 0.00045442115879268385, + "clip_ratio/high_mean": 8.075954519881634e-05, + "clip_ratio/low_mean": 0.00020491478198891855, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028567432855197694, + "epoch": 0.02205100379815518, + "grad_norm": 0.059372879564762115, + "kl": 0.0018067359924316406, + "learning_rate": 1e-06, + "loss": 0.0571, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0007542960265709553, + "clip_ratio/high_mean": 0.00015878474175679003, + "clip_ratio/low_mean": 0.0002918565678555751, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004506413060880732, + "epoch": 0.022224633749321758, + "grad_norm": 0.0571664422750473, + "kl": 0.0018205642700195312, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0379464285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2919.0, + "completions/mean_length": 955.6138916015625, + "completions/mean_terminated_length": 872.1368408203125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.022398263700488336, + "grad_norm": 0.057005640119314194, + "kl": 0.0013856887817382812, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 9047183.0, + "reward": 0.243303582072258, + "reward_std": 0.19260793924331665, + "rewards/accuracy_reward/mean": 0.2433035671710968, + "rewards/accuracy_reward/std": 0.42955654859542847, + "step": 129 + }, + { + "clip_ratio/high_max": 7.326933246076806e-05, + "clip_ratio/high_mean": 1.6158443600033934e-05, + "clip_ratio/low_mean": 3.133396603516303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.74924088393891e-05, + "epoch": 0.02257189365165491, + "grad_norm": 0.056930623948574066, + "kl": 0.0014224052429199219, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0001849055006459821, + "clip_ratio/high_mean": 3.323665521293151e-05, + "clip_ratio/low_mean": 5.8128867522100336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.136552466770809e-05, + "epoch": 0.022745523602821487, + "grad_norm": 0.056313302367925644, + "kl": 0.0014595985412597656, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 131 + }, + { + "clip_ratio/high_max": 0.00022968542089074617, + "clip_ratio/high_mean": 4.783902329563716e-05, + "clip_ratio/low_mean": 5.7108149803752895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010494717275832954, + "epoch": 0.02291915355398806, + "grad_norm": 0.05616778880357742, + "kl": 0.0014619827270507812, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 132 + }, + { + "clip_ratio/high_max": 0.00026807786525751, + "clip_ratio/high_mean": 6.31882905963721e-05, + "clip_ratio/low_mean": 0.00013262989091344934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019581817377911648, + "epoch": 0.02309278350515464, + "grad_norm": 0.05515192821621895, + "kl": 0.0015435218811035156, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 133 + }, + { + "clip_ratio/high_max": 0.00045220999072626, + "clip_ratio/high_mean": 9.471168698382826e-05, + "clip_ratio/low_mean": 0.00014267419146563043, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002373858787905192, + "epoch": 0.023266413456321217, + "grad_norm": 0.05476092919707298, + "kl": 0.0015516281127929688, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0004470752182896831, + "clip_ratio/high_mean": 0.00011202639598195674, + "clip_ratio/low_mean": 0.0001511907557869563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026321716177335475, + "epoch": 0.02344004340748779, + "grad_norm": 0.054482921957969666, + "kl": 0.0015797615051269531, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0006936039699212415, + "clip_ratio/high_mean": 0.0001707356555016304, + "clip_ratio/low_mean": 0.00024215895155066391, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004128946266064304, + "epoch": 0.02361367335865437, + "grad_norm": 0.05383079871535301, + "kl": 0.0016427040100097656, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0401785714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3003.0, + "completions/mean_length": 987.8973388671875, + "completions/mean_terminated_length": 900.6558227539062, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.023787303309820942, + "grad_norm": 0.08290693163871765, + "kl": 0.0018758773803710938, + "learning_rate": 1e-06, + "loss": 0.0378, + "num_tokens": 9549905.0, + "reward": 0.3035714328289032, + "reward_std": 0.2951851487159729, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0002218438739873818, + "clip_ratio/high_mean": 4.477615084397257e-05, + "clip_ratio/low_mean": 6.174628288135864e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010652243554432062, + "epoch": 0.02396093326098752, + "grad_norm": 0.08015606552362442, + "kl": 0.0019130706787109375, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0003537330576364184, + "clip_ratio/high_mean": 9.184439272758027e-05, + "clip_ratio/low_mean": 7.87214435149508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001705658287391998, + "epoch": 0.024134563212154098, + "grad_norm": 0.07555245608091354, + "kl": 0.0019807815551757812, + "learning_rate": 1e-06, + "loss": 0.0376, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0003051557650906034, + "clip_ratio/high_mean": 7.390725613731775e-05, + "clip_ratio/low_mean": 0.0001728247807477601, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002467320355208358, + "epoch": 0.02430819316332067, + "grad_norm": 0.07540517300367355, + "kl": 0.002040863037109375, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0004980636476830114, + "clip_ratio/high_mean": 0.00011843086258522817, + "clip_ratio/low_mean": 0.0001919290658634054, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00031035993379191495, + "epoch": 0.02448182311448725, + "grad_norm": 0.07049719244241714, + "kl": 0.0021924972534179688, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0007083129894454032, + "clip_ratio/high_mean": 0.00015012747144282912, + "clip_ratio/low_mean": 0.00033265770889556734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000482785168060218, + "epoch": 0.024655453065653827, + "grad_norm": 0.06842149794101715, + "kl": 0.0022840499877929688, + "learning_rate": 1e-06, + "loss": 0.0372, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0014356730971485376, + "clip_ratio/high_mean": 0.00031906748336041346, + "clip_ratio/low_mean": 0.0003924428983737016, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007115103908290621, + "epoch": 0.0248290830168204, + "grad_norm": 0.06725531816482544, + "kl": 0.0024328231811523438, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 143 + }, + { + "clip_ratio/high_max": 0.00184958733007079, + "clip_ratio/high_mean": 0.00042206205398542807, + "clip_ratio/low_mean": 0.00047157559947663685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008936376871133689, + "epoch": 0.02500271296798698, + "grad_norm": 0.06511066108942032, + "kl": 0.00250244140625, + "learning_rate": 1e-06, + "loss": 0.0368, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033482142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3032.0, + "completions/mean_length": 920.5535888671875, + "completions/mean_terminated_length": 846.0230712890625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.025176342919153553, + "grad_norm": 0.07030708342790604, + "kl": 0.0033507347106933594, + "learning_rate": 1e-06, + "loss": 0.0516, + "num_tokens": 10031681.0, + "reward": 0.2366071492433548, + "reward_std": 0.22048716247081757, + "rewards/accuracy_reward/mean": 0.2366071492433548, + "rewards/accuracy_reward/std": 0.4254741966724396, + "step": 145 + }, + { + "clip_ratio/high_max": 0.00012553701526485384, + "clip_ratio/high_mean": 1.9572279597923625e-05, + "clip_ratio/low_mean": 3.527367766764655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4845957834004366e-05, + "epoch": 0.02534997287032013, + "grad_norm": 0.06973014771938324, + "kl": 0.003448009490966797, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 146 + }, + { + "clip_ratio/high_max": 0.00023112245617085136, + "clip_ratio/high_mean": 4.108958228243864e-05, + "clip_ratio/low_mean": 7.726499984528346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011835458099085372, + "epoch": 0.025523602821486708, + "grad_norm": 0.06742454320192337, + "kl": 0.0036106109619140625, + "learning_rate": 1e-06, + "loss": 0.0514, + "step": 147 + }, + { + "clip_ratio/high_max": 0.00028261789339012466, + "clip_ratio/high_mean": 5.363225932342175e-05, + "clip_ratio/low_mean": 8.012215084818308e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013375441358220996, + "epoch": 0.025697232772653282, + "grad_norm": 0.06728488951921463, + "kl": 0.0036330223083496094, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 148 + }, + { + "clip_ratio/high_max": 0.00047033898681547726, + "clip_ratio/high_mean": 0.00011440584557931288, + "clip_ratio/low_mean": 0.000143927357839857, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002583332079666434, + "epoch": 0.02587086272381986, + "grad_norm": 0.06493983417749405, + "kl": 0.0037364959716796875, + "learning_rate": 1e-06, + "loss": 0.0512, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0007258505465870257, + "clip_ratio/high_mean": 0.00015386336144729285, + "clip_ratio/low_mean": 0.00022877389665154624, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00038263726219156524, + "epoch": 0.026044492674986434, + "grad_norm": 0.062032733112573624, + "kl": 0.0039081573486328125, + "learning_rate": 1e-06, + "loss": 0.0511, + "step": 150 + }, + { + "clip_ratio/high_max": 0.000978127933194628, + "clip_ratio/high_mean": 0.00021059580376459053, + "clip_ratio/low_mean": 0.00031547414164379006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000526069929037476, + "epoch": 0.02621812262615301, + "grad_norm": 0.05963636189699173, + "kl": 0.0039005279541015625, + "learning_rate": 1e-06, + "loss": 0.0509, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0015005462882982101, + "clip_ratio/high_mean": 0.00035101260891678976, + "clip_ratio/low_mean": 0.00038873040193720954, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007397430272249039, + "epoch": 0.02639175257731959, + "grad_norm": 0.05930650979280472, + "kl": 0.004211902618408203, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0379464285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3065.0, + "completions/mean_length": 912.7120971679688, + "completions/mean_terminated_length": 827.5429077148438, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.026565382528486163, + "grad_norm": 0.08117852360010147, + "kl": 0.0026111602783203125, + "learning_rate": 1e-06, + "loss": 0.0603, + "num_tokens": 10507168.0, + "reward": 0.2723214328289032, + "reward_std": 0.23747976124286652, + "rewards/accuracy_reward/mean": 0.2723214328289032, + "rewards/accuracy_reward/std": 0.4456520676612854, + "step": 153 + }, + { + "clip_ratio/high_max": 0.00024314642178069334, + "clip_ratio/high_mean": 4.562768071991741e-05, + "clip_ratio/low_mean": 8.730899253350799e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013293667370817275, + "epoch": 0.02673901247965274, + "grad_norm": 0.07922299206256866, + "kl": 0.0026540756225585938, + "learning_rate": 1e-06, + "loss": 0.0603, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0002608774939290015, + "clip_ratio/high_mean": 5.474172508002084e-05, + "clip_ratio/low_mean": 0.00014282652716701705, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019756825201966421, + "epoch": 0.026912642430819315, + "grad_norm": 0.0762348622083664, + "kl": 0.0027418136596679688, + "learning_rate": 1e-06, + "loss": 0.0601, + "step": 155 + }, + { + "clip_ratio/high_max": 0.00036479203481576405, + "clip_ratio/high_mean": 9.768490372152883e-05, + "clip_ratio/low_mean": 0.00023774758369654592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033543246968292806, + "epoch": 0.027086272381985892, + "grad_norm": 0.07322980463504791, + "kl": 0.0028104782104492188, + "learning_rate": 1e-06, + "loss": 0.06, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0002822166024998296, + "clip_ratio/high_mean": 6.688455550829531e-05, + "clip_ratio/low_mean": 0.000275632558327743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003425171188382592, + "epoch": 0.02725990233315247, + "grad_norm": 0.07279310375452042, + "kl": 0.0028982162475585938, + "learning_rate": 1e-06, + "loss": 0.0598, + "step": 157 + }, + { + "clip_ratio/high_max": 0.00045094662073097425, + "clip_ratio/high_mean": 0.0001143783389352393, + "clip_ratio/low_mean": 0.0004326280250097625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005470063615575782, + "epoch": 0.027433532284319044, + "grad_norm": 0.0695834681391716, + "kl": 0.0029802322387695312, + "learning_rate": 1e-06, + "loss": 0.0596, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0006921358872205019, + "clip_ratio/high_mean": 0.00017893295341764315, + "clip_ratio/low_mean": 0.0005072136981425501, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006861466806640237, + "epoch": 0.02760716223548562, + "grad_norm": 0.0684238150715828, + "kl": 0.0030870437622070312, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0013232454330136534, + "clip_ratio/high_mean": 0.00032933773400145583, + "clip_ratio/low_mean": 0.0007189521752479777, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010482899342605378, + "epoch": 0.0277807921866522, + "grad_norm": 0.065030537545681, + "kl": 0.0034379959106445312, + "learning_rate": 1e-06, + "loss": 0.0591, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3052.0, + "completions/mean_length": 1043.18310546875, + "completions/mean_terminated_length": 933.38818359375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.027954422137818773, + "grad_norm": 0.052377089858055115, + "kl": 0.0028295516967773438, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 11045314.0, + "reward": 0.330357164144516, + "reward_std": 0.20050694048404694, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "step": 161 + }, + { + "clip_ratio/high_max": 0.00024209098774008453, + "clip_ratio/high_mean": 5.171584029994847e-05, + "clip_ratio/low_mean": 9.617499017622322e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.133333772595506e-05, + "epoch": 0.02812805208898535, + "grad_norm": 0.05258831009268761, + "kl": 0.0028676986694335938, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0003229970971005969, + "clip_ratio/high_mean": 6.822645605097932e-05, + "clip_ratio/low_mean": 3.125610123788647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.948255683411844e-05, + "epoch": 0.028301682040151925, + "grad_norm": 0.047927577048540115, + "kl": 0.0029458999633789062, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0003256688087276416, + "clip_ratio/high_mean": 6.948249210836366e-05, + "clip_ratio/low_mean": 4.39329550090406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011341544404785964, + "epoch": 0.028475311991318503, + "grad_norm": 0.05049503222107887, + "kl": 0.0031719207763671875, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 164 + }, + { + "clip_ratio/high_max": 0.000370184202438395, + "clip_ratio/high_mean": 7.70422022924322e-05, + "clip_ratio/low_mean": 5.284456028675777e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012988675916858483, + "epoch": 0.02864894194248508, + "grad_norm": 0.0459713339805603, + "kl": 0.0033349990844726562, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0004074840153407422, + "clip_ratio/high_mean": 8.759766228649823e-05, + "clip_ratio/low_mean": 8.854583165884833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017614349599170964, + "epoch": 0.028822571893651654, + "grad_norm": 0.044937968254089355, + "kl": 0.0033702850341796875, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 166 + }, + { + "clip_ratio/high_max": 0.000457144911706564, + "clip_ratio/high_mean": 0.0001094272352020198, + "clip_ratio/low_mean": 0.00013749242225458147, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002469196515448857, + "epoch": 0.028996201844818232, + "grad_norm": 0.0443839393556118, + "kl": 0.0034618377685546875, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0007651772466488183, + "clip_ratio/high_mean": 0.00019795518346654717, + "clip_ratio/low_mean": 0.00018231052672490478, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00038026570473448373, + "epoch": 0.029169831795984806, + "grad_norm": 0.043584346771240234, + "kl": 0.0035676956176757812, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3007.0, + "completions/mean_length": 939.904052734375, + "completions/mean_terminated_length": 876.1862182617188, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.029343461747151384, + "grad_norm": 0.058060579001903534, + "kl": 0.0027599334716796875, + "learning_rate": 1e-06, + "loss": 0.0357, + "num_tokens": 11526207.0, + "reward": 0.3325892984867096, + "reward_std": 0.24348647892475128, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166749835014343, + "step": 169 + }, + { + "clip_ratio/high_max": 7.949582868604921e-05, + "clip_ratio/high_mean": 1.4768956816624268e-05, + "clip_ratio/low_mean": 6.115878431955935e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.592774454678874e-05, + "epoch": 0.02951709169831796, + "grad_norm": 0.057736754417419434, + "kl": 0.0028047561645507812, + "learning_rate": 1e-06, + "loss": 0.0356, + "step": 170 + }, + { + "clip_ratio/high_max": 0.00015729896767879836, + "clip_ratio/high_mean": 4.2888379084615735e-05, + "clip_ratio/low_mean": 8.17910949990619e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012467947408367763, + "epoch": 0.029690721649484535, + "grad_norm": 0.057525020092725754, + "kl": 0.002838134765625, + "learning_rate": 1e-06, + "loss": 0.0356, + "step": 171 + }, + { + "clip_ratio/high_max": 0.00023573577527713496, + "clip_ratio/high_mean": 5.4839815902596456e-05, + "clip_ratio/low_mean": 0.00010850183298316551, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001633416522963671, + "epoch": 0.029864351600651113, + "grad_norm": 0.05670091137290001, + "kl": 0.0028829574584960938, + "learning_rate": 1e-06, + "loss": 0.0355, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0003415731025597779, + "clip_ratio/high_mean": 6.922935222064552e-05, + "clip_ratio/low_mean": 0.0001644781750655966, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002337075347895734, + "epoch": 0.030037981551817687, + "grad_norm": 0.056562550365924835, + "kl": 0.0029077529907226562, + "learning_rate": 1e-06, + "loss": 0.0353, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0004495588964346098, + "clip_ratio/high_mean": 0.00012080550823156955, + "clip_ratio/low_mean": 0.00019655877258628607, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003173642799083609, + "epoch": 0.030211611502984265, + "grad_norm": 0.0561620332300663, + "kl": 0.0029401779174804688, + "learning_rate": 1e-06, + "loss": 0.0352, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0005854300325154327, + "clip_ratio/high_mean": 0.00017523811584396753, + "clip_ratio/low_mean": 0.0002363654875807697, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00041160360706271604, + "epoch": 0.030385241454150842, + "grad_norm": 0.05543660745024681, + "kl": 0.0029840469360351562, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0009771084696694743, + "clip_ratio/high_mean": 0.00027096680787508376, + "clip_ratio/low_mean": 0.00029686722427868517, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000567834016692359, + "epoch": 0.030558871405317416, + "grad_norm": 0.05448862165212631, + "kl": 0.003055572509765625, + "learning_rate": 1e-06, + "loss": 0.0349, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2985.0, + "completions/mean_length": 1008.0022583007812, + "completions/mean_terminated_length": 931.557861328125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.030732501356483994, + "grad_norm": 0.08998759090900421, + "kl": 0.0031576156616210938, + "learning_rate": 1e-06, + "loss": 0.0775, + "num_tokens": 12051168.0, + "reward": 0.3013392984867096, + "reward_std": 0.20764635503292084, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "step": 177 + }, + { + "clip_ratio/high_max": 0.00015629503104719333, + "clip_ratio/high_mean": 2.997989861341921e-05, + "clip_ratio/low_mean": 9.554711175496777e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012552701150525536, + "epoch": 0.030906131307650568, + "grad_norm": 0.07973919808864594, + "kl": 0.0031671524047851562, + "learning_rate": 1e-06, + "loss": 0.0774, + "step": 178 + }, + { + "clip_ratio/high_max": 0.00025089192740779254, + "clip_ratio/high_mean": 5.6041383686533663e-05, + "clip_ratio/low_mean": 0.00014061532374398666, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019665670424728887, + "epoch": 0.031079761258817146, + "grad_norm": 0.07631222158670425, + "kl": 0.0032281875610351562, + "learning_rate": 1e-06, + "loss": 0.0773, + "step": 179 + }, + { + "clip_ratio/high_max": 0.00023156802672019694, + "clip_ratio/high_mean": 6.145132510937401e-05, + "clip_ratio/low_mean": 0.00023873491204540187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030018624488548085, + "epoch": 0.03125339120998372, + "grad_norm": 0.07314057648181915, + "kl": 0.0038042068481445312, + "learning_rate": 1e-06, + "loss": 0.077, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0004854790277022403, + "clip_ratio/high_mean": 0.00013014399382882402, + "clip_ratio/low_mean": 0.0003142020809718815, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00044434608298615785, + "epoch": 0.0314270211611503, + "grad_norm": 0.07185754179954529, + "kl": 0.004241943359375, + "learning_rate": 1e-06, + "loss": 0.0769, + "step": 181 + }, + { + "clip_ratio/high_max": 0.00048584528303763364, + "clip_ratio/high_mean": 0.00014717924295837292, + "clip_ratio/low_mean": 0.000399926654608862, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005471058798320882, + "epoch": 0.03160065111231687, + "grad_norm": 0.07045626640319824, + "kl": 0.00583648681640625, + "learning_rate": 1e-06, + "loss": 0.0767, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0005844768056704197, + "clip_ratio/high_mean": 0.00017479775351603166, + "clip_ratio/low_mean": 0.000574327963477117, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007491256938010338, + "epoch": 0.03177428106348345, + "grad_norm": 0.0706855058670044, + "kl": 0.010582923889160156, + "learning_rate": 1e-06, + "loss": 0.0764, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0007202654705906752, + "clip_ratio/high_mean": 0.0002495038406777894, + "clip_ratio/low_mean": 0.0007111470912377627, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009606508992874296, + "epoch": 0.03194791101465003, + "grad_norm": 0.06658057868480682, + "kl": 0.008107185363769531, + "learning_rate": 1e-06, + "loss": 0.0761, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.024553571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2989.0, + "completions/mean_length": 958.138427734375, + "completions/mean_terminated_length": 904.9290161132812, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.032121540965816604, + "grad_norm": 0.07046614587306976, + "kl": 0.003448486328125, + "learning_rate": 1e-06, + "loss": 0.068, + "num_tokens": 12544158.0, + "reward": 0.3437500298023224, + "reward_std": 0.26166731119155884, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0003379352210686193, + "clip_ratio/high_mean": 5.736567982239649e-05, + "clip_ratio/low_mean": 4.496884105265053e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010233451757812873, + "epoch": 0.03229517091698318, + "grad_norm": 0.06952168792486191, + "kl": 0.0035343170166015625, + "learning_rate": 1e-06, + "loss": 0.068, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0003869851816489245, + "clip_ratio/high_mean": 7.145971255795303e-05, + "clip_ratio/low_mean": 7.222868202916288e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001436883933365607, + "epoch": 0.03246880086814975, + "grad_norm": 0.06756498664617538, + "kl": 0.00366973876953125, + "learning_rate": 1e-06, + "loss": 0.0679, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0003951659928134177, + "clip_ratio/high_mean": 6.739753143847338e-05, + "clip_ratio/low_mean": 0.00013503104480605543, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020242857999619446, + "epoch": 0.03264243081931633, + "grad_norm": 0.06552397459745407, + "kl": 0.003772735595703125, + "learning_rate": 1e-06, + "loss": 0.0678, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0006933476051926846, + "clip_ratio/high_mean": 0.0001376764244014339, + "clip_ratio/low_mean": 0.00022158519732329296, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003592616158130113, + "epoch": 0.03281606077048291, + "grad_norm": 0.06394247710704803, + "kl": 0.0037860870361328125, + "learning_rate": 1e-06, + "loss": 0.0676, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0009899373217194807, + "clip_ratio/high_mean": 0.0002147355144188623, + "clip_ratio/low_mean": 0.0002963067954624421, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000511042308062315, + "epoch": 0.032989690721649485, + "grad_norm": 0.06160977855324745, + "kl": 0.004016876220703125, + "learning_rate": 1e-06, + "loss": 0.0674, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0011956020243815146, + "clip_ratio/high_mean": 0.00024122577406160417, + "clip_ratio/low_mean": 0.000372293266991619, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006135190560598858, + "epoch": 0.03316332067281606, + "grad_norm": 0.06096015125513077, + "kl": 0.0042057037353515625, + "learning_rate": 1e-06, + "loss": 0.0672, + "step": 191 + }, + { + "clip_ratio/high_max": 0.001829320641263621, + "clip_ratio/high_mean": 0.0003938561858376488, + "clip_ratio/low_mean": 0.0005409362638602033, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009347924242320005, + "epoch": 0.03333695062398264, + "grad_norm": 0.05924614891409874, + "kl": 0.0044708251953125, + "learning_rate": 1e-06, + "loss": 0.067, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.049107142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3063.0, + "completions/mean_length": 968.8125610351562, + "completions/mean_terminated_length": 860.1972045898438, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.03351058057514921, + "grad_norm": 0.08352500945329666, + "kl": 0.004497528076171875, + "learning_rate": 1e-06, + "loss": 0.109, + "num_tokens": 13036994.0, + "reward": 0.4151785969734192, + "reward_std": 0.31909093260765076, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "step": 193 + }, + { + "clip_ratio/high_max": 0.000140161529998295, + "clip_ratio/high_mean": 3.017108031144744e-05, + "clip_ratio/low_mean": 4.686322324687353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.703430196670524e-05, + "epoch": 0.03368421052631579, + "grad_norm": 0.08296140283346176, + "kl": 0.004650115966796875, + "learning_rate": 1e-06, + "loss": 0.109, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0003227378510928247, + "clip_ratio/high_mean": 6.0443295751611004e-05, + "clip_ratio/low_mean": 9.267849782190751e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015312179402826587, + "epoch": 0.033857840477482366, + "grad_norm": 0.07949453592300415, + "kl": 0.0046291351318359375, + "learning_rate": 1e-06, + "loss": 0.1088, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0004256090387571021, + "clip_ratio/high_mean": 0.00012772009233685822, + "clip_ratio/low_mean": 0.0001363359301649325, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002640560178406304, + "epoch": 0.034031470428648944, + "grad_norm": 0.07807482033967972, + "kl": 0.0046863555908203125, + "learning_rate": 1e-06, + "loss": 0.1086, + "step": 196 + }, + { + "clip_ratio/high_max": 0.00045126841177989263, + "clip_ratio/high_mean": 0.00012729068498629204, + "clip_ratio/low_mean": 0.00023031252203509212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035760319951805286, + "epoch": 0.03420510037981552, + "grad_norm": 0.07618335634469986, + "kl": 0.004894256591796875, + "learning_rate": 1e-06, + "loss": 0.1083, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0006292932230280712, + "clip_ratio/high_mean": 0.00015736467776150675, + "clip_ratio/low_mean": 0.00031010527618491324, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004674699594033882, + "epoch": 0.03437873033098209, + "grad_norm": 0.07498215138912201, + "kl": 0.0049343109130859375, + "learning_rate": 1e-06, + "loss": 0.1081, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0008988762601802591, + "clip_ratio/high_mean": 0.000263835643636412, + "clip_ratio/low_mean": 0.00046786652455921285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007317021663766354, + "epoch": 0.03455236028214867, + "grad_norm": 0.0731743574142456, + "kl": 0.005153656005859375, + "learning_rate": 1e-06, + "loss": 0.1078, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0014448634610744193, + "clip_ratio/high_mean": 0.00038249538010859396, + "clip_ratio/low_mean": 0.000571403325011488, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009538987433188595, + "epoch": 0.03472599023331525, + "grad_norm": 0.07175468653440475, + "kl": 0.0052967071533203125, + "learning_rate": 1e-06, + "loss": 0.1074, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0379464285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2539.0, + "completions/mean_length": 901.8973388671875, + "completions/mean_terminated_length": 816.3015747070312, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.034899620184481825, + "grad_norm": 0.07219688594341278, + "kl": 0.0058536529541015625, + "learning_rate": 1e-06, + "loss": 0.0525, + "num_tokens": 13499420.0, + "reward": 0.3415178656578064, + "reward_std": 0.24220769107341766, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "step": 201 + }, + { + "clip_ratio/high_max": 0.00022225413158594165, + "clip_ratio/high_mean": 5.331816169018566e-05, + "clip_ratio/low_mean": 3.999007969923696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.330824423159356e-05, + "epoch": 0.0350732501356484, + "grad_norm": 0.07217645645141602, + "kl": 0.0058422088623046875, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0002689363118406618, + "clip_ratio/high_mean": 4.732967659037968e-05, + "clip_ratio/low_mean": 6.601988286547567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011334956116115791, + "epoch": 0.03524688008681497, + "grad_norm": 0.06945184618234634, + "kl": 0.0058040618896484375, + "learning_rate": 1e-06, + "loss": 0.0523, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0002276651703141397, + "clip_ratio/high_mean": 4.509589098233846e-05, + "clip_ratio/low_mean": 0.00011881938007718418, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016391527628911717, + "epoch": 0.03542051003798155, + "grad_norm": 0.06934140622615814, + "kl": 0.0057373046875, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0003470212650427129, + "clip_ratio/high_mean": 7.446860672644107e-05, + "clip_ratio/low_mean": 0.00020419288193807006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00027866149321198463, + "epoch": 0.03559413998914813, + "grad_norm": 0.06609861552715302, + "kl": 0.0058307647705078125, + "learning_rate": 1e-06, + "loss": 0.052, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0005327750805008691, + "clip_ratio/high_mean": 0.00013216317529440857, + "clip_ratio/low_mean": 0.0002984118009408121, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004305749771447154, + "epoch": 0.035767769940314706, + "grad_norm": 0.0640389621257782, + "kl": 0.0058765411376953125, + "learning_rate": 1e-06, + "loss": 0.0518, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0006460240965679986, + "clip_ratio/high_mean": 0.00016327472349075833, + "clip_ratio/low_mean": 0.0004372286666693981, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006005033847031882, + "epoch": 0.035941399891481284, + "grad_norm": 0.06271988153457642, + "kl": 0.0059070587158203125, + "learning_rate": 1e-06, + "loss": 0.0516, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0008908939234970603, + "clip_ratio/high_mean": 0.0002099051257573592, + "clip_ratio/low_mean": 0.0005713552454835735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007812603525962913, + "epoch": 0.036115029842647854, + "grad_norm": 0.061273712664842606, + "kl": 0.006015777587890625, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0580357142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3057.0, + "completions/mean_length": 952.16748046875, + "completions/mean_terminated_length": 821.5616455078125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.03628865979381443, + "grad_norm": 0.06407427042722702, + "kl": 0.004886627197265625, + "learning_rate": 1e-06, + "loss": 0.1025, + "num_tokens": 13985263.0, + "reward": 0.3191964328289032, + "reward_std": 0.2290615439414978, + "rewards/accuracy_reward/mean": 0.3191964328289032, + "rewards/accuracy_reward/std": 0.4666863977909088, + "step": 209 + }, + { + "clip_ratio/high_max": 0.00015849755891395034, + "clip_ratio/high_mean": 2.3897754886093026e-05, + "clip_ratio/low_mean": 7.990761650944478e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010380537696619285, + "epoch": 0.03646228974498101, + "grad_norm": 0.062143974006175995, + "kl": 0.0049839019775390625, + "learning_rate": 1e-06, + "loss": 0.1025, + "step": 210 + }, + { + "clip_ratio/high_max": 5.2734306336787995e-05, + "clip_ratio/high_mean": 9.400788712810026e-06, + "clip_ratio/low_mean": 9.952195864570967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010892274804064073, + "epoch": 0.03663591969614759, + "grad_norm": 0.06083887815475464, + "kl": 0.0050144195556640625, + "learning_rate": 1e-06, + "loss": 0.1024, + "step": 211 + }, + { + "clip_ratio/high_max": 0.00019393379716348136, + "clip_ratio/high_mean": 3.7539814911724534e-05, + "clip_ratio/low_mean": 0.00015922662441880675, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001967664411495207, + "epoch": 0.036809549647314165, + "grad_norm": 0.05928577110171318, + "kl": 0.004993438720703125, + "learning_rate": 1e-06, + "loss": 0.1022, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0002714459042181261, + "clip_ratio/high_mean": 5.4696326742487145e-05, + "clip_ratio/low_mean": 0.00022914259307071916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002838389173120959, + "epoch": 0.036983179598480735, + "grad_norm": 0.05882363021373749, + "kl": 0.0050983428955078125, + "learning_rate": 1e-06, + "loss": 0.1021, + "step": 213 + }, + { + "clip_ratio/high_max": 0.00029216874827397987, + "clip_ratio/high_mean": 6.417147642423515e-05, + "clip_ratio/low_mean": 0.0002869319855562935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035110346379951807, + "epoch": 0.03715680954964731, + "grad_norm": 0.05811864510178566, + "kl": 0.005207061767578125, + "learning_rate": 1e-06, + "loss": 0.1019, + "step": 214 + }, + { + "clip_ratio/high_max": 0.000571939079236472, + "clip_ratio/high_mean": 0.00015079140621310216, + "clip_ratio/low_mean": 0.0003746497259271564, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005254411134956172, + "epoch": 0.03733043950081389, + "grad_norm": 0.056842949241399765, + "kl": 0.005207061767578125, + "learning_rate": 1e-06, + "loss": 0.1017, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0009351344560855068, + "clip_ratio/high_mean": 0.00023273797432921128, + "clip_ratio/low_mean": 0.000497910777994548, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007306487459572963, + "epoch": 0.03750406945198047, + "grad_norm": 0.05533328279852867, + "kl": 0.0053424835205078125, + "learning_rate": 1e-06, + "loss": 0.1015, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2872.0, + "completions/mean_length": 926.55810546875, + "completions/mean_terminated_length": 867.5091552734375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.037677699403147046, + "grad_norm": 0.07703442871570587, + "kl": 0.0050373077392578125, + "learning_rate": 1e-06, + "loss": 0.0593, + "num_tokens": 14467825.0, + "reward": 0.28125, + "reward_std": 0.20200327038764954, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45011183619499207, + "step": 217 + }, + { + "clip_ratio/high_max": 6.790631960029714e-05, + "clip_ratio/high_mean": 9.700902978693193e-06, + "clip_ratio/low_mean": 3.6159358842269285e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.586026261677034e-05, + "epoch": 0.037851329354313616, + "grad_norm": 0.07659391313791275, + "kl": 0.00499725341796875, + "learning_rate": 1e-06, + "loss": 0.0593, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0002732168450165773, + "clip_ratio/high_mean": 4.3419124835963885e-05, + "clip_ratio/low_mean": 4.087164938937349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.429077468008472e-05, + "epoch": 0.038024959305480194, + "grad_norm": 0.07656816393136978, + "kl": 0.0050830841064453125, + "learning_rate": 1e-06, + "loss": 0.0592, + "step": 219 + }, + { + "clip_ratio/high_max": 0.00043329581967554986, + "clip_ratio/high_mean": 7.427704031215399e-05, + "clip_ratio/low_mean": 7.956944955367362e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015384648577310145, + "epoch": 0.03819858925664677, + "grad_norm": 0.07245522737503052, + "kl": 0.0051059722900390625, + "learning_rate": 1e-06, + "loss": 0.059, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0010144469370061415, + "clip_ratio/high_mean": 0.0001617254663415224, + "clip_ratio/low_mean": 0.0001599316456122324, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00032165711309062317, + "epoch": 0.03837221920781335, + "grad_norm": 0.06884177774190903, + "kl": 0.00505828857421875, + "learning_rate": 1e-06, + "loss": 0.0589, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0013151224520697724, + "clip_ratio/high_mean": 0.00023866308310971363, + "clip_ratio/low_mean": 0.00029511865602671605, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005337817792678834, + "epoch": 0.038545849158979927, + "grad_norm": 0.061943087726831436, + "kl": 0.0050640106201171875, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0022168690484249964, + "clip_ratio/high_mean": 0.0003760762010642793, + "clip_ratio/low_mean": 0.0004110157383365731, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007870919434935786, + "epoch": 0.0387194791101465, + "grad_norm": 0.06023077294230461, + "kl": 0.005130767822265625, + "learning_rate": 1e-06, + "loss": 0.0585, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0029914466322225053, + "clip_ratio/high_mean": 0.0005386665370679111, + "clip_ratio/low_mean": 0.0006065707384550478, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011452372928033583, + "epoch": 0.038893109061313075, + "grad_norm": 0.05875673517584801, + "kl": 0.0051403045654296875, + "learning_rate": 1e-06, + "loss": 0.0582, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3019.0, + "completions/mean_length": 1040.6875, + "completions/mean_terminated_length": 930.7576293945312, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.03906673901247965, + "grad_norm": 0.07734653353691101, + "kl": 0.004603385925292969, + "learning_rate": 1e-06, + "loss": 0.0694, + "num_tokens": 15011133.0, + "reward": 0.3102678656578064, + "reward_std": 0.27459728717803955, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312037110328674, + "step": 225 + }, + { + "clip_ratio/high_max": 0.00019487492318148725, + "clip_ratio/high_mean": 4.5932367584100575e-05, + "clip_ratio/low_mean": 7.053138790524827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011646375571672252, + "epoch": 0.03924036896364623, + "grad_norm": 0.06685353070497513, + "kl": 0.00495147705078125, + "learning_rate": 1e-06, + "loss": 0.0694, + "step": 226 + }, + { + "clip_ratio/high_max": 0.00031368246345664375, + "clip_ratio/high_mean": 6.847989561720169e-05, + "clip_ratio/low_mean": 8.863970174388669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015711960259068292, + "epoch": 0.03941399891481281, + "grad_norm": 0.0640801414847374, + "kl": 0.006466865539550781, + "learning_rate": 1e-06, + "loss": 0.0692, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0005355771545509924, + "clip_ratio/high_mean": 0.00011912179661521805, + "clip_ratio/low_mean": 0.00014311500808616984, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002622368083393667, + "epoch": 0.03958762886597938, + "grad_norm": 0.07433435320854187, + "kl": 0.013917922973632812, + "learning_rate": 1e-06, + "loss": 0.0691, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0006701716301904526, + "clip_ratio/high_mean": 0.00013557579131884268, + "clip_ratio/low_mean": 0.0002315429728696472, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003671187678264687, + "epoch": 0.039761258817145956, + "grad_norm": 0.0625142753124237, + "kl": 0.008487701416015625, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0008232090513047297, + "clip_ratio/high_mean": 0.00017708818268147297, + "clip_ratio/low_mean": 0.00025413072307856055, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00043121890848851763, + "epoch": 0.03993488876831253, + "grad_norm": 0.059884194284677505, + "kl": 0.005160331726074219, + "learning_rate": 1e-06, + "loss": 0.0688, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0011380590331100393, + "clip_ratio/high_mean": 0.00027094270353700267, + "clip_ratio/low_mean": 0.00033740878188837087, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006083514854253735, + "epoch": 0.04010851871947911, + "grad_norm": 0.05885405093431473, + "kl": 0.005904197692871094, + "learning_rate": 1e-06, + "loss": 0.0685, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0015637538763257908, + "clip_ratio/high_mean": 0.00033849308420030866, + "clip_ratio/low_mean": 0.0004142180532653583, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007527111192757729, + "epoch": 0.04028214867064569, + "grad_norm": 0.07160970568656921, + "kl": 0.0067691802978515625, + "learning_rate": 1e-06, + "loss": 0.0683, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022321428571428603, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3057.0, + "completions/mean_length": 876.8504638671875, + "completions/mean_terminated_length": 826.7328491210938, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.04045577862181226, + "grad_norm": 0.062098950147628784, + "kl": 0.0056095123291015625, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 15472922.0, + "reward": 0.2946428656578064, + "reward_std": 0.19855552911758423, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639169216156006, + "step": 233 + }, + { + "clip_ratio/high_max": 0.00010377163380326238, + "clip_ratio/high_mean": 1.9110096218355466e-05, + "clip_ratio/low_mean": 4.0245389982374036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.93554862007295e-05, + "epoch": 0.04062940857297884, + "grad_norm": 0.0625988245010376, + "kl": 0.00557708740234375, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 234 + }, + { + "clip_ratio/high_max": 0.000248120824835496, + "clip_ratio/high_mean": 3.9824263694754336e-05, + "clip_ratio/low_mean": 4.7418674853361154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.72429370701866e-05, + "epoch": 0.040803038524145414, + "grad_norm": 0.06109880656003952, + "kl": 0.0056209564208984375, + "learning_rate": 1e-06, + "loss": 0.0339, + "step": 235 + }, + { + "clip_ratio/high_max": 0.00048607084318064153, + "clip_ratio/high_mean": 8.778467281445046e-05, + "clip_ratio/low_mean": 7.282936212504865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001606140376679832, + "epoch": 0.04097666847531199, + "grad_norm": 0.058381833136081696, + "kl": 0.0055522918701171875, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0006172246039568563, + "clip_ratio/high_mean": 0.00011923551528525422, + "clip_ratio/low_mean": 0.00010164225318476383, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022087777142587584, + "epoch": 0.04115029842647857, + "grad_norm": 0.05656541883945465, + "kl": 0.00556182861328125, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0006294661616266239, + "clip_ratio/high_mean": 0.00013370380429478246, + "clip_ratio/low_mean": 0.00013383337841332832, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002675371806617477, + "epoch": 0.04132392837764515, + "grad_norm": 0.054563816636800766, + "kl": 0.005542755126953125, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0008545208220311906, + "clip_ratio/high_mean": 0.0002191799603679101, + "clip_ratio/low_mean": 0.0001912095015086379, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004103894552827114, + "epoch": 0.04149755832881172, + "grad_norm": 0.05368833616375923, + "kl": 0.0055751800537109375, + "learning_rate": 1e-06, + "loss": 0.0333, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0012206146639073268, + "clip_ratio/high_mean": 0.0002968720500575728, + "clip_ratio/low_mean": 0.00026627426154846034, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005631463272948167, + "epoch": 0.041671188279978295, + "grad_norm": 0.05278347060084343, + "kl": 0.0055980682373046875, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3062.0, + "completions/mean_length": 907.138427734375, + "completions/mean_terminated_length": 842.44140625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.04184481823114487, + "grad_norm": 0.07795985788106918, + "kl": 0.0053615570068359375, + "learning_rate": 1e-06, + "loss": 0.0367, + "num_tokens": 15942000.0, + "reward": 0.3571428656578064, + "reward_std": 0.29097503423690796, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "step": 241 + }, + { + "clip_ratio/high_max": 0.00023618104478373425, + "clip_ratio/high_mean": 5.623849551739113e-05, + "clip_ratio/low_mean": 4.708936762654048e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010332786200706323, + "epoch": 0.04201844818231145, + "grad_norm": 0.08209370821714401, + "kl": 0.0053768157958984375, + "learning_rate": 1e-06, + "loss": 0.0367, + "step": 242 + }, + { + "clip_ratio/high_max": 0.00019513875759002985, + "clip_ratio/high_mean": 4.236984761973872e-05, + "clip_ratio/low_mean": 6.438475884351647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001067546049853263, + "epoch": 0.04219207813347803, + "grad_norm": 0.07343382388353348, + "kl": 0.0057773590087890625, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0003968233195337234, + "clip_ratio/high_mean": 7.895871931395959e-05, + "clip_ratio/low_mean": 0.00010558860458331765, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001845473316279822, + "epoch": 0.0423657080846446, + "grad_norm": 0.06719336658716202, + "kl": 0.007381439208984375, + "learning_rate": 1e-06, + "loss": 0.0364, + "step": 244 + }, + { + "clip_ratio/high_max": 0.00034431864150974434, + "clip_ratio/high_mean": 0.00010109648519573966, + "clip_ratio/low_mean": 0.00019400129531277344, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002950977695945767, + "epoch": 0.042539338035811176, + "grad_norm": 0.06623882800340652, + "kl": 0.010259628295898438, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0006708281780447578, + "clip_ratio/high_mean": 0.00018490812908567023, + "clip_ratio/low_mean": 0.0002478310889273416, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00043273921983200125, + "epoch": 0.042712967986977754, + "grad_norm": 0.06415053457021713, + "kl": 0.012523651123046875, + "learning_rate": 1e-06, + "loss": 0.036, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0010141216152987909, + "clip_ratio/high_mean": 0.0002891791241381725, + "clip_ratio/low_mean": 0.00031956291877577314, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006087420424591983, + "epoch": 0.04288659793814433, + "grad_norm": 0.06282375752925873, + "kl": 0.01096343994140625, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0015019040474726353, + "clip_ratio/high_mean": 0.00040584710313851247, + "clip_ratio/low_mean": 0.00043446086328913225, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008403079482377507, + "epoch": 0.04306022788931091, + "grad_norm": 0.06128554791212082, + "kl": 0.009317398071289062, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2845.0, + "completions/mean_length": 941.5670166015625, + "completions/mean_terminated_length": 862.6620483398438, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.04323385784047748, + "grad_norm": 0.07063725590705872, + "kl": 0.0073642730712890625, + "learning_rate": 1e-06, + "loss": 0.0601, + "num_tokens": 16430582.0, + "reward": 0.3727678656578064, + "reward_std": 0.24220770597457886, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "step": 249 + }, + { + "clip_ratio/high_max": 0.00020031468193337787, + "clip_ratio/high_mean": 3.476571146165952e-05, + "clip_ratio/low_mean": 6.625006051308446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010101577299792552, + "epoch": 0.04340748779164406, + "grad_norm": 0.06864381581544876, + "kl": 0.0077762603759765625, + "learning_rate": 1e-06, + "loss": 0.06, + "step": 250 + }, + { + "clip_ratio/high_max": 0.00016189541565836407, + "clip_ratio/high_mean": 3.912930105798296e-05, + "clip_ratio/low_mean": 0.00012363333485154726, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000162762638865388, + "epoch": 0.043581117742810635, + "grad_norm": 0.06514810025691986, + "kl": 0.0077838897705078125, + "learning_rate": 1e-06, + "loss": 0.0598, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0002215413087469642, + "clip_ratio/high_mean": 4.305978052343562e-05, + "clip_ratio/low_mean": 0.00022305307766146143, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002661128637555521, + "epoch": 0.04375474769397721, + "grad_norm": 0.06342755258083344, + "kl": 0.0071201324462890625, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0003869523479806958, + "clip_ratio/high_mean": 0.0001022462720356998, + "clip_ratio/low_mean": 0.0002877782198993373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00039002447738312185, + "epoch": 0.04392837764514379, + "grad_norm": 0.061800289899110794, + "kl": 0.00754547119140625, + "learning_rate": 1e-06, + "loss": 0.0595, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0005191371383261867, + "clip_ratio/high_mean": 0.00013858500869901036, + "clip_ratio/low_mean": 0.0004190501440461958, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005576351450145012, + "epoch": 0.04410200759631036, + "grad_norm": 0.059872034937143326, + "kl": 0.0073833465576171875, + "learning_rate": 1e-06, + "loss": 0.0593, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0007083320306264795, + "clip_ratio/high_mean": 0.00019542840436770348, + "clip_ratio/low_mean": 0.000516365797011531, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007117942095646868, + "epoch": 0.04427563754747694, + "grad_norm": 0.058419279754161835, + "kl": 0.007396697998046875, + "learning_rate": 1e-06, + "loss": 0.0591, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0007946904079290107, + "clip_ratio/high_mean": 0.00027383365613786737, + "clip_ratio/low_mean": 0.0006758919789717766, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009497256032773294, + "epoch": 0.044449267498643516, + "grad_norm": 0.05724295228719711, + "kl": 0.00705718994140625, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3027.0, + "completions/mean_length": 941.76123046875, + "completions/mean_terminated_length": 878.098876953125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.044622897449810094, + "grad_norm": 0.060508158057928085, + "kl": 0.0050525665283203125, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 16914499.0, + "reward": 0.2901785969734192, + "reward_std": 0.1770561784505844, + "rewards/accuracy_reward/mean": 0.2901785671710968, + "rewards/accuracy_reward/std": 0.4543520212173462, + "step": 257 + }, + { + "clip_ratio/high_max": 9.353849964099936e-05, + "clip_ratio/high_mean": 2.103227188854362e-05, + "clip_ratio/low_mean": 3.136462350994407e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.23968948300535e-05, + "epoch": 0.04479652740097667, + "grad_norm": 0.05994526296854019, + "kl": 0.00505828857421875, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 258 + }, + { + "clip_ratio/high_max": 0.00020633136591641232, + "clip_ratio/high_mean": 3.963158269471023e-05, + "clip_ratio/low_mean": 7.10599949798052e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011069157744714175, + "epoch": 0.04497015735214324, + "grad_norm": 0.056097980588674545, + "kl": 0.0055408477783203125, + "learning_rate": 1e-06, + "loss": 0.0372, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0002540853474783944, + "clip_ratio/high_mean": 4.4107477378929616e-05, + "clip_ratio/low_mean": 0.00010519251759433246, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014929998997104121, + "epoch": 0.04514378730330982, + "grad_norm": 0.05632726475596428, + "kl": 0.0072269439697265625, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0004259305715095252, + "clip_ratio/high_mean": 8.404366553804721e-05, + "clip_ratio/low_mean": 0.00018473069758329075, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026877435766436975, + "epoch": 0.0453174172544764, + "grad_norm": 0.09751134365797043, + "kl": 0.025461196899414062, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0006160968723634141, + "clip_ratio/high_mean": 0.00013288566321989492, + "clip_ratio/low_mean": 0.00019220442413825367, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003250900886087038, + "epoch": 0.045491047205642975, + "grad_norm": 0.05354911461472511, + "kl": 0.0053157806396484375, + "learning_rate": 1e-06, + "loss": 0.0368, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0008643306409794604, + "clip_ratio/high_mean": 0.00019448020816525968, + "clip_ratio/low_mean": 0.0003239961540657532, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005184763945180748, + "epoch": 0.04566467715680955, + "grad_norm": 0.05303514748811722, + "kl": 0.005519866943359375, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 263 + }, + { + "clip_ratio/high_max": 0.001355866901576519, + "clip_ratio/high_mean": 0.00028985774633838446, + "clip_ratio/low_mean": 0.0003956428708988824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006855006386103923, + "epoch": 0.04583830710797612, + "grad_norm": 0.05054004490375519, + "kl": 0.0057773590087890625, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2973.0, + "completions/mean_length": 991.1406860351562, + "completions/mean_terminated_length": 914.07177734375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0460119370591427, + "grad_norm": 0.06739285588264465, + "kl": 0.0048274993896484375, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 17419210.0, + "reward": 0.2946428656578064, + "reward_std": 0.22567379474639893, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639169216156006, + "step": 265 + }, + { + "clip_ratio/high_max": 0.00016192326165764825, + "clip_ratio/high_mean": 2.524561978134443e-05, + "clip_ratio/low_mean": 1.922370933016282e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4469327804108616e-05, + "epoch": 0.04618556701030928, + "grad_norm": 0.06700567156076431, + "kl": 0.004856109619140625, + "learning_rate": 1e-06, + "loss": 0.0342, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0002134851547452854, + "clip_ratio/high_mean": 4.8476697884325404e-05, + "clip_ratio/low_mean": 4.052649569530331e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.900319448912342e-05, + "epoch": 0.046359196961475856, + "grad_norm": 0.06606908142566681, + "kl": 0.0049610137939453125, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 267 + }, + { + "clip_ratio/high_max": 0.000238894994254224, + "clip_ratio/high_mean": 6.014573023094272e-05, + "clip_ratio/low_mean": 7.323912814172218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013338485950953327, + "epoch": 0.04653282691264243, + "grad_norm": 0.0640922486782074, + "kl": 0.005035400390625, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0004048451355629368, + "clip_ratio/high_mean": 8.832772527966881e-05, + "clip_ratio/low_mean": 0.0001314838095822779, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021981153440719936, + "epoch": 0.046706456863809004, + "grad_norm": 0.06180435046553612, + "kl": 0.0051631927490234375, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 269 + }, + { + "clip_ratio/high_max": 0.00042566919364617206, + "clip_ratio/high_mean": 0.00010506819876354712, + "clip_ratio/low_mean": 0.00019753369633690454, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030260188941610977, + "epoch": 0.04688008681497558, + "grad_norm": 0.06020312383770943, + "kl": 0.00527191162109375, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0007700443820795044, + "clip_ratio/high_mean": 0.0001932386594489799, + "clip_ratio/low_mean": 0.0002969075612782035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004901462289126357, + "epoch": 0.04705371676614216, + "grad_norm": 0.05897383391857147, + "kl": 0.00539398193359375, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0012605484298546799, + "clip_ratio/high_mean": 0.0003097341177635826, + "clip_ratio/low_mean": 0.00041161479930451605, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007213489188870881, + "epoch": 0.04722734671730874, + "grad_norm": 0.058010537177324295, + "kl": 0.0055389404296875, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3061.0, + "completions/mean_length": 851.5870971679688, + "completions/mean_terminated_length": 779.9608154296875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.047400976668475314, + "grad_norm": 0.08966987580060959, + "kl": 0.00626373291015625, + "learning_rate": 1e-06, + "loss": 0.0759, + "num_tokens": 17861377.0, + "reward": 0.3571428656578064, + "reward_std": 0.30179888010025024, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0001605883317097323, + "clip_ratio/high_mean": 4.534556683211122e-05, + "clip_ratio/low_mean": 7.896989200162352e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012431546065272414, + "epoch": 0.047574606619641885, + "grad_norm": 0.0873035416007042, + "kl": 0.006275177001953125, + "learning_rate": 1e-06, + "loss": 0.0758, + "step": 274 + }, + { + "clip_ratio/high_max": 0.00021376000040618237, + "clip_ratio/high_mean": 4.36021082350635e-05, + "clip_ratio/low_mean": 0.00011392920532671269, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001575313096964237, + "epoch": 0.04774823657080846, + "grad_norm": 0.08415080606937408, + "kl": 0.006378173828125, + "learning_rate": 1e-06, + "loss": 0.0756, + "step": 275 + }, + { + "clip_ratio/high_max": 0.00033713232005538885, + "clip_ratio/high_mean": 8.745402078602638e-05, + "clip_ratio/low_mean": 0.00021641604666911007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030387006950149953, + "epoch": 0.04792186652197504, + "grad_norm": 0.07931944727897644, + "kl": 0.0064239501953125, + "learning_rate": 1e-06, + "loss": 0.0753, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0004226724913678481, + "clip_ratio/high_mean": 0.00010187912403125665, + "clip_ratio/low_mean": 0.0003550987789822102, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00045697789755649865, + "epoch": 0.04809549647314162, + "grad_norm": 0.07786241173744202, + "kl": 0.006526947021484375, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0004876680177403614, + "clip_ratio/high_mean": 0.00016212375703617, + "clip_ratio/low_mean": 0.0005133480717631755, + "clip_ratio/low_min": 4.0650407754583284e-05, + "clip_ratio/region_mean": 0.000675471848808229, + "epoch": 0.048269126424308195, + "grad_norm": 0.07480353116989136, + "kl": 0.006641387939453125, + "learning_rate": 1e-06, + "loss": 0.0747, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0012324294511927292, + "clip_ratio/high_mean": 0.000332753936163499, + "clip_ratio/low_mean": 0.0007735885374131612, + "clip_ratio/low_min": 2.34345698117977e-05, + "clip_ratio/region_mean": 0.0011063424553867662, + "epoch": 0.04844275637547477, + "grad_norm": 0.0702272430062294, + "kl": 0.006763458251953125, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0014421493906411342, + "clip_ratio/high_mean": 0.00041218113892682595, + "clip_ratio/low_mean": 0.0009530982906653662, + "clip_ratio/low_min": 6.097560981288552e-05, + "clip_ratio/region_mean": 0.0013652794532390544, + "epoch": 0.04861638632664134, + "grad_norm": 0.06830212473869324, + "kl": 0.0068817138671875, + "learning_rate": 1e-06, + "loss": 0.0741, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2966.0, + "completions/mean_length": 922.4710083007812, + "completions/mean_terminated_length": 806.1434936523438, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.04879001627780792, + "grad_norm": 0.07949787378311157, + "kl": 0.00719451904296875, + "learning_rate": 1e-06, + "loss": 0.083, + "num_tokens": 18332116.0, + "reward": 0.4799107313156128, + "reward_std": 0.2820347547531128, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0001998847537834081, + "clip_ratio/high_mean": 3.373063202616322e-05, + "clip_ratio/low_mean": 4.849972401643754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.223035638366127e-05, + "epoch": 0.0489636462289745, + "grad_norm": 0.07878443598747253, + "kl": 0.007335662841796875, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0002505343209122657, + "clip_ratio/high_mean": 4.8857516503630904e-05, + "clip_ratio/low_mean": 7.547520181105938e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012433271649570088, + "epoch": 0.049137276180141076, + "grad_norm": 0.07637440413236618, + "kl": 0.0074005126953125, + "learning_rate": 1e-06, + "loss": 0.0828, + "step": 283 + }, + { + "clip_ratio/high_max": 0.00025776311213121517, + "clip_ratio/high_mean": 6.53541358133225e-05, + "clip_ratio/low_mean": 0.00015036954891911591, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021572368177658063, + "epoch": 0.049310906131307654, + "grad_norm": 0.07400918751955032, + "kl": 0.007476806640625, + "learning_rate": 1e-06, + "loss": 0.0825, + "step": 284 + }, + { + "clip_ratio/high_max": 0.000453416398158879, + "clip_ratio/high_mean": 0.00010801970029206132, + "clip_ratio/low_mean": 0.0003123989649793657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00042041867982334225, + "epoch": 0.049484536082474224, + "grad_norm": 0.06988854706287384, + "kl": 0.0076446533203125, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0007676553177589085, + "clip_ratio/high_mean": 0.00018302588523511076, + "clip_ratio/low_mean": 0.0004619739061126893, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006449997918025474, + "epoch": 0.0496581660336408, + "grad_norm": 0.06732655316591263, + "kl": 0.00780487060546875, + "learning_rate": 1e-06, + "loss": 0.0819, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0011395079673093278, + "clip_ratio/high_mean": 0.0002781070566015842, + "clip_ratio/low_mean": 0.0005988135985717236, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008769206324359402, + "epoch": 0.04983179598480738, + "grad_norm": 0.06665686517953873, + "kl": 0.0085906982421875, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0015629008048563264, + "clip_ratio/high_mean": 0.0003871869575959863, + "clip_ratio/low_mean": 0.0008609295123278571, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001248116475835559, + "epoch": 0.05000542593597396, + "grad_norm": 0.06433871388435364, + "kl": 0.008209228515625, + "learning_rate": 1e-06, + "loss": 0.0814, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2935.0, + "completions/mean_length": 926.3170166015625, + "completions/mean_terminated_length": 867.2614135742188, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.050179055887140535, + "grad_norm": 0.06643395125865936, + "kl": 0.006938934326171875, + "learning_rate": 1e-06, + "loss": 0.0317, + "num_tokens": 18815842.0, + "reward": 0.283482164144516, + "reward_std": 0.2583692967891693, + "rewards/accuracy_reward/mean": 0.2834821343421936, + "rewards/accuracy_reward/std": 0.4511922299861908, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0003123140459138085, + "clip_ratio/high_mean": 6.21080882865499e-05, + "clip_ratio/low_mean": 4.5941967414364626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010805005251768307, + "epoch": 0.050352685838307105, + "grad_norm": 0.06594830006361008, + "kl": 0.006916046142578125, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 290 + }, + { + "clip_ratio/high_max": 0.00023929768303787569, + "clip_ratio/high_mean": 6.255331015836418e-05, + "clip_ratio/low_mean": 4.7806821157792e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011036013302145875, + "epoch": 0.05052631578947368, + "grad_norm": 0.06469112634658813, + "kl": 0.006977081298828125, + "learning_rate": 1e-06, + "loss": 0.0316, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0002586631198937539, + "clip_ratio/high_mean": 6.922550142007822e-05, + "clip_ratio/low_mean": 7.943579021230107e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014866128822177416, + "epoch": 0.05069994574064026, + "grad_norm": 0.06349951028823853, + "kl": 0.007030487060546875, + "learning_rate": 1e-06, + "loss": 0.0314, + "step": 292 + }, + { + "clip_ratio/high_max": 0.00042957832010870334, + "clip_ratio/high_mean": 0.00012425359830103844, + "clip_ratio/low_mean": 0.0001268721914584603, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025112579169217497, + "epoch": 0.05087357569180684, + "grad_norm": 0.06216879189014435, + "kl": 0.007099151611328125, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0006666590361419367, + "clip_ratio/high_mean": 0.00016270859043743258, + "clip_ratio/low_mean": 0.0001714420361622615, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033415062353014946, + "epoch": 0.051047205642973416, + "grad_norm": 0.06121203675866127, + "kl": 0.00717926025390625, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0009229369024978951, + "clip_ratio/high_mean": 0.00026319695643906016, + "clip_ratio/low_mean": 0.00027043375757784816, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005336307222023606, + "epoch": 0.051220835594139986, + "grad_norm": 0.06015590950846672, + "kl": 0.00724029541015625, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0012426360481185839, + "clip_ratio/high_mean": 0.00038000732274667826, + "clip_ratio/low_mean": 0.0004012878252979135, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007812951334926765, + "epoch": 0.051394465545306564, + "grad_norm": 0.05853807181119919, + "kl": 0.007404327392578125, + "learning_rate": 1e-06, + "loss": 0.0305, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2689.0, + "completions/mean_length": 869.5714721679688, + "completions/mean_terminated_length": 788.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.05156809549647314, + "grad_norm": 0.0845448449254036, + "kl": 0.008144378662109375, + "learning_rate": 1e-06, + "loss": 0.0393, + "num_tokens": 19265786.0, + "reward": 0.408482164144516, + "reward_std": 0.28106045722961426, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "step": 297 + }, + { + "clip_ratio/high_max": 0.00019608572983997874, + "clip_ratio/high_mean": 3.846525328299322e-05, + "clip_ratio/low_mean": 6.76260506224935e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010609130504235509, + "epoch": 0.05174172544763972, + "grad_norm": 0.0822998583316803, + "kl": 0.008108139038085938, + "learning_rate": 1e-06, + "loss": 0.0392, + "step": 298 + }, + { + "clip_ratio/high_max": 0.000367565653505153, + "clip_ratio/high_mean": 7.71552161040745e-05, + "clip_ratio/low_mean": 0.00015584997890982777, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002330051900116814, + "epoch": 0.0519153553988063, + "grad_norm": 0.07886821031570435, + "kl": 0.008052825927734375, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0007286291183845606, + "clip_ratio/high_mean": 0.00016052503224273096, + "clip_ratio/low_mean": 0.0003011094893281552, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00046163452589098597, + "epoch": 0.05208898534997287, + "grad_norm": 0.07584602385759354, + "kl": 0.008008956909179688, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0013207844567659777, + "clip_ratio/high_mean": 0.00033218473527085735, + "clip_ratio/low_mean": 0.00045401635998132406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007862010588723933, + "epoch": 0.052262615301139445, + "grad_norm": 0.07002071291208267, + "kl": 0.007955551147460938, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 301 + }, + { + "clip_ratio/high_max": 0.001383892129524611, + "clip_ratio/high_mean": 0.00038688031872879947, + "clip_ratio/low_mean": 0.0006034226244082674, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009903029531415086, + "epoch": 0.05243624525230602, + "grad_norm": 0.06730292737483978, + "kl": 0.007963180541992188, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0020160504900559317, + "clip_ratio/high_mean": 0.0005702830758309574, + "clip_ratio/low_mean": 0.0008280014535557711, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013982844720885623, + "epoch": 0.0526098752034726, + "grad_norm": 0.06562905013561249, + "kl": 0.007966995239257812, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 303 + }, + { + "clip_ratio/high_max": 0.002757076952548232, + "clip_ratio/high_mean": 0.0007698839326621965, + "clip_ratio/low_mean": 0.0011136934281239519, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018835773662431166, + "epoch": 0.05278350515463918, + "grad_norm": 0.06432028859853745, + "kl": 0.008121490478515625, + "learning_rate": 1e-06, + "loss": 0.0376, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2959.0, + "completions/mean_length": 949.4152221679688, + "completions/mean_terminated_length": 880.9447021484375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.05295713510580575, + "grad_norm": 0.08733092248439789, + "kl": 0.0059604644775390625, + "learning_rate": 1e-06, + "loss": 0.0849, + "num_tokens": 19759084.0, + "reward": 0.3415178656578064, + "reward_std": 0.23777782917022705, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "step": 305 + }, + { + "clip_ratio/high_max": 0.00022321581855067052, + "clip_ratio/high_mean": 4.442176077645854e-05, + "clip_ratio/low_mean": 5.937946241374448e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010380122409969772, + "epoch": 0.053130765056972326, + "grad_norm": 0.08582375943660736, + "kl": 0.0059108734130859375, + "learning_rate": 1e-06, + "loss": 0.0848, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0001373877903461107, + "clip_ratio/high_mean": 2.4557514052503393e-05, + "clip_ratio/low_mean": 0.00011884130560702033, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001433988235248762, + "epoch": 0.053304395008138904, + "grad_norm": 0.08099596947431564, + "kl": 0.005962371826171875, + "learning_rate": 1e-06, + "loss": 0.0846, + "step": 307 + }, + { + "clip_ratio/high_max": 0.00031567388759867754, + "clip_ratio/high_mean": 7.723764929323806e-05, + "clip_ratio/low_mean": 0.00022481346013591974, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003020510926035058, + "epoch": 0.05347802495930548, + "grad_norm": 0.0770307332277298, + "kl": 0.006092071533203125, + "learning_rate": 1e-06, + "loss": 0.0843, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0005100320377096068, + "clip_ratio/high_mean": 0.0001354095022634283, + "clip_ratio/low_mean": 0.0003394417140043515, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00047485121876889025, + "epoch": 0.05365165491047206, + "grad_norm": 0.07363981008529663, + "kl": 0.00641632080078125, + "learning_rate": 1e-06, + "loss": 0.084, + "step": 309 + }, + { + "clip_ratio/high_max": 0.000718115181371104, + "clip_ratio/high_mean": 0.00019020097897737287, + "clip_ratio/low_mean": 0.0005468344879773213, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007370354860540829, + "epoch": 0.05382528486163863, + "grad_norm": 0.07094141095876694, + "kl": 0.0067901611328125, + "learning_rate": 1e-06, + "loss": 0.0837, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0010459593140694778, + "clip_ratio/high_mean": 0.0002841574773810862, + "clip_ratio/low_mean": 0.0007559218720416538, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010400794071756536, + "epoch": 0.05399891481280521, + "grad_norm": 0.07066638767719269, + "kl": 0.01213836669921875, + "learning_rate": 1e-06, + "loss": 0.0833, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0014701148611493409, + "clip_ratio/high_mean": 0.0004067589570695418, + "clip_ratio/low_mean": 0.0010627014617057284, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001469460456064553, + "epoch": 0.054172544763971785, + "grad_norm": 0.06604013592004776, + "kl": 0.00823974609375, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2898.0, + "completions/mean_length": 1010.7656860351562, + "completions/mean_terminated_length": 954.0343627929688, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.05434617471513836, + "grad_norm": 0.059996746480464935, + "kl": 0.005687713623046875, + "learning_rate": 1e-06, + "loss": 0.0515, + "num_tokens": 20275467.0, + "reward": 0.3571428656578064, + "reward_std": 0.21222595870494843, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0001253683230970637, + "clip_ratio/high_mean": 1.7909760231304972e-05, + "clip_ratio/low_mean": 3.6437782227949356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.434754348243587e-05, + "epoch": 0.05451980466630494, + "grad_norm": 0.05972859635949135, + "kl": 0.0057353973388671875, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 314 + }, + { + "clip_ratio/high_max": 0.00016321390103257727, + "clip_ratio/high_mean": 3.6228717135600164e-05, + "clip_ratio/low_mean": 7.020551993264235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010643423775036354, + "epoch": 0.05469343461747151, + "grad_norm": 0.058286990970373154, + "kl": 0.0056781768798828125, + "learning_rate": 1e-06, + "loss": 0.0514, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0002854832246157457, + "clip_ratio/high_mean": 5.420449110715708e-05, + "clip_ratio/low_mean": 0.0001089364175186347, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016314090498781297, + "epoch": 0.05486706456863809, + "grad_norm": 0.05725838616490364, + "kl": 0.0057086944580078125, + "learning_rate": 1e-06, + "loss": 0.0512, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0004620184208761202, + "clip_ratio/high_mean": 9.578551885169873e-05, + "clip_ratio/low_mean": 0.00012953469649801264, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022532021512233769, + "epoch": 0.055040694519804666, + "grad_norm": 0.055246591567993164, + "kl": 0.0057201385498046875, + "learning_rate": 1e-06, + "loss": 0.0511, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0007053697227092925, + "clip_ratio/high_mean": 0.00015162010822677985, + "clip_ratio/low_mean": 0.0001961540947377216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00034777420114551205, + "epoch": 0.05521432447097124, + "grad_norm": 0.054006077349185944, + "kl": 0.0057525634765625, + "learning_rate": 1e-06, + "loss": 0.0509, + "step": 318 + }, + { + "clip_ratio/high_max": 0.001098469780117739, + "clip_ratio/high_mean": 0.00022285287150225486, + "clip_ratio/low_mean": 0.0002760996901542967, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004989525559722097, + "epoch": 0.05538795442213782, + "grad_norm": 0.05298594385385513, + "kl": 0.00583648681640625, + "learning_rate": 1e-06, + "loss": 0.0507, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0014096367376623675, + "clip_ratio/high_mean": 0.00030796924420428695, + "clip_ratio/low_mean": 0.0003364029171279981, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006443721940740943, + "epoch": 0.0555615843733044, + "grad_norm": 0.05270378291606903, + "kl": 0.0059795379638671875, + "learning_rate": 1e-06, + "loss": 0.0505, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033482142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2831.0, + "completions/mean_length": 931.58935546875, + "completions/mean_terminated_length": 857.4411010742188, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.05573521432447097, + "grad_norm": 0.07154438644647598, + "kl": 0.006378173828125, + "learning_rate": 1e-06, + "loss": 0.0448, + "num_tokens": 20755547.0, + "reward": 0.3526785969734192, + "reward_std": 0.24799686670303345, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378839492798, + "step": 321 + }, + { + "clip_ratio/high_max": 0.00020322147429396864, + "clip_ratio/high_mean": 4.281151154827967e-05, + "clip_ratio/low_mean": 5.6118230531865265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.892974298963964e-05, + "epoch": 0.05590884427563755, + "grad_norm": 0.07094588130712509, + "kl": 0.006381988525390625, + "learning_rate": 1e-06, + "loss": 0.0447, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0002358427018407383, + "clip_ratio/high_mean": 5.9382473864388885e-05, + "clip_ratio/low_mean": 0.00010233332886855351, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001617158000044583, + "epoch": 0.056082474226804124, + "grad_norm": 0.0702100619673729, + "kl": 0.00641632080078125, + "learning_rate": 1e-06, + "loss": 0.0446, + "step": 323 + }, + { + "clip_ratio/high_max": 0.00046155449126672465, + "clip_ratio/high_mean": 9.153493806479673e-05, + "clip_ratio/low_mean": 0.0001642723580062011, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002558072897045349, + "epoch": 0.0562561041779707, + "grad_norm": 0.06722190976142883, + "kl": 0.0064334869384765625, + "learning_rate": 1e-06, + "loss": 0.0444, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0007024502501735697, + "clip_ratio/high_mean": 0.0001828085876240948, + "clip_ratio/low_mean": 0.00020051222145411884, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00038332080453074013, + "epoch": 0.05642973412913728, + "grad_norm": 0.0668235570192337, + "kl": 0.0064983367919921875, + "learning_rate": 1e-06, + "loss": 0.0442, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0009024349456012715, + "clip_ratio/high_mean": 0.0002448219024699938, + "clip_ratio/low_mean": 0.000368841066801906, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006136629790489678, + "epoch": 0.05660336408030385, + "grad_norm": 0.06355249136686325, + "kl": 0.006549835205078125, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0013838571976521052, + "clip_ratio/high_mean": 0.00036128328883933136, + "clip_ratio/low_mean": 0.0005197108957872842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008809941627987428, + "epoch": 0.05677699403147043, + "grad_norm": 0.060376983135938644, + "kl": 0.0065708160400390625, + "learning_rate": 1e-06, + "loss": 0.0437, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0019945110034313984, + "clip_ratio/high_mean": 0.0005276443816910614, + "clip_ratio/low_mean": 0.0007792924766363285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013069368797005154, + "epoch": 0.056950623982637005, + "grad_norm": 0.05769287422299385, + "kl": 0.0066738128662109375, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033482142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2960.0, + "completions/mean_length": 979.0469360351562, + "completions/mean_terminated_length": 906.542724609375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.05712425393380358, + "grad_norm": 0.06739767640829086, + "kl": 0.00618743896484375, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 21260784.0, + "reward": 0.3549107313156128, + "reward_std": 0.2653539478778839, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "step": 329 + }, + { + "clip_ratio/high_max": 0.00015782529590069316, + "clip_ratio/high_mean": 3.453848376011592e-05, + "clip_ratio/low_mean": 4.399564068080508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.853412489566836e-05, + "epoch": 0.05729788388497016, + "grad_norm": 0.06770840287208557, + "kl": 0.006214141845703125, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 330 + }, + { + "clip_ratio/high_max": 0.00016773168044892373, + "clip_ratio/high_mean": 3.33282195015272e-05, + "clip_ratio/low_mean": 6.293544925028982e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.626366613701975e-05, + "epoch": 0.05747151383613673, + "grad_norm": 0.06333687901496887, + "kl": 0.006290435791015625, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 331 + }, + { + "clip_ratio/high_max": 0.00040709618224354926, + "clip_ratio/high_mean": 0.00010449895353303873, + "clip_ratio/low_mean": 0.00010856338144549227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021306233247742057, + "epoch": 0.05764514378730331, + "grad_norm": 0.06344660371541977, + "kl": 0.00626373291015625, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 332 + }, + { + "clip_ratio/high_max": 0.00046426416156464256, + "clip_ratio/high_mean": 0.00013336538495423156, + "clip_ratio/low_mean": 0.00013622456754092127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002695899620448472, + "epoch": 0.057818773738469886, + "grad_norm": 0.060955412685871124, + "kl": 0.00628662109375, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0007499683651985833, + "clip_ratio/high_mean": 0.00018011142765317345, + "clip_ratio/low_mean": 0.00023876636896602577, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00041887780753313564, + "epoch": 0.057992403689636464, + "grad_norm": 0.06093477085232735, + "kl": 0.006439208984375, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0009169900949927978, + "clip_ratio/high_mean": 0.0002625715787871741, + "clip_ratio/low_mean": 0.00037085511758050416, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006334266872727312, + "epoch": 0.05816603364080304, + "grad_norm": 0.060754720121622086, + "kl": 0.006458282470703125, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0014435739358305, + "clip_ratio/high_mean": 0.0003758763450605329, + "clip_ratio/low_mean": 0.0004552715690806508, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008311479141411837, + "epoch": 0.05833966359196961, + "grad_norm": 0.05688638985157013, + "kl": 0.0066070556640625, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3049.0, + "completions/mean_length": 1015.3594360351562, + "completions/mean_terminated_length": 919.254638671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.05851329354313619, + "grad_norm": 0.06466048210859299, + "kl": 0.00494384765625, + "learning_rate": 1e-06, + "loss": 0.0379, + "num_tokens": 21783657.0, + "reward": 0.3013392984867096, + "reward_std": 0.25724107027053833, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "step": 337 + }, + { + "clip_ratio/high_max": 0.00011686661127896514, + "clip_ratio/high_mean": 1.7974303204937314e-05, + "clip_ratio/low_mean": 5.6507517456338974e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.448182327607356e-05, + "epoch": 0.05868692349430277, + "grad_norm": 0.06299170851707458, + "kl": 0.0049686431884765625, + "learning_rate": 1e-06, + "loss": 0.0379, + "step": 338 + }, + { + "clip_ratio/high_max": 0.00019993075147795025, + "clip_ratio/high_mean": 5.427902647170413e-05, + "clip_ratio/low_mean": 8.337086978826846e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013764990035269875, + "epoch": 0.058860553445469345, + "grad_norm": 0.06079478934407234, + "kl": 0.0049896240234375, + "learning_rate": 1e-06, + "loss": 0.0378, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0002074319290841231, + "clip_ratio/high_mean": 4.8314182095055e-05, + "clip_ratio/low_mean": 0.00013436131530397688, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001826754962621635, + "epoch": 0.05903418339663592, + "grad_norm": 0.059405338019132614, + "kl": 0.005062103271484375, + "learning_rate": 1e-06, + "loss": 0.0376, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0003447157305345172, + "clip_ratio/high_mean": 9.53452181420289e-05, + "clip_ratio/low_mean": 0.000228547291499126, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003238925219193334, + "epoch": 0.05920781334780249, + "grad_norm": 0.058088842779397964, + "kl": 0.005138397216796875, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0005202038064453518, + "clip_ratio/high_mean": 0.00015841264371374564, + "clip_ratio/low_mean": 0.0002997879437316442, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000458200592220237, + "epoch": 0.05938144329896907, + "grad_norm": 0.05708559975028038, + "kl": 0.00531005859375, + "learning_rate": 1e-06, + "loss": 0.0372, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0007481373795599211, + "clip_ratio/high_mean": 0.0002369811068092531, + "clip_ratio/low_mean": 0.0004216943220853864, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006586754225281766, + "epoch": 0.05955507325013565, + "grad_norm": 0.055297259241342545, + "kl": 0.005596160888671875, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0009841370374488179, + "clip_ratio/high_mean": 0.0003442479292061762, + "clip_ratio/low_mean": 0.0004872983208770165, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000831546249173698, + "epoch": 0.059728703201302226, + "grad_norm": 0.054491810500621796, + "kl": 0.006168365478515625, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2985.0, + "completions/mean_length": 886.9754638671875, + "completions/mean_terminated_length": 806.0486450195312, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.059902333152468804, + "grad_norm": 0.07429070770740509, + "kl": 0.0064544677734375, + "learning_rate": 1e-06, + "loss": 0.0417, + "num_tokens": 22246334.0, + "reward": 0.3638392984867096, + "reward_std": 0.29112476110458374, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "step": 345 + }, + { + "clip_ratio/high_max": 0.00030035827148822136, + "clip_ratio/high_mean": 5.879954687770805e-05, + "clip_ratio/low_mean": 5.32730310851548e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011207257603018661, + "epoch": 0.060075963103635374, + "grad_norm": 0.07200067490339279, + "kl": 0.006420135498046875, + "learning_rate": 1e-06, + "loss": 0.0416, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0003802006012847414, + "clip_ratio/high_mean": 8.521970255515043e-05, + "clip_ratio/low_mean": 0.0001073378498404054, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019255754705227446, + "epoch": 0.06024959305480195, + "grad_norm": 0.06983835250139236, + "kl": 0.00640869140625, + "learning_rate": 1e-06, + "loss": 0.0414, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0005957315588602796, + "clip_ratio/high_mean": 0.00015045818054204574, + "clip_ratio/low_mean": 0.0002226495976174192, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003731077813426964, + "epoch": 0.06042322300596853, + "grad_norm": 0.06727462261915207, + "kl": 0.00646209716796875, + "learning_rate": 1e-06, + "loss": 0.0412, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0008143403683789074, + "clip_ratio/high_mean": 0.00020838158707192633, + "clip_ratio/low_mean": 0.0003074368646593939, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005158184576430358, + "epoch": 0.06059685295713511, + "grad_norm": 0.0657956451177597, + "kl": 0.0064697265625, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0012356561164779123, + "clip_ratio/high_mean": 0.0003418914493522607, + "clip_ratio/low_mean": 0.00043947848189418437, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007813699194230139, + "epoch": 0.060770482908301685, + "grad_norm": 0.06357403844594955, + "kl": 0.00644683837890625, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0016889451071619987, + "clip_ratio/high_mean": 0.00046365106936718803, + "clip_ratio/low_mean": 0.0006350816156555084, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010987326932081487, + "epoch": 0.060944112859468255, + "grad_norm": 0.06124107167124748, + "kl": 0.006473541259765625, + "learning_rate": 1e-06, + "loss": 0.0403, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0019769157079281285, + "clip_ratio/high_mean": 0.0006289206194196595, + "clip_ratio/low_mean": 0.0007867491913202684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014156698343867902, + "epoch": 0.06111774281063483, + "grad_norm": 0.059978216886520386, + "kl": 0.006481170654296875, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3058.0, + "completions/mean_length": 967.4710083007812, + "completions/mean_terminated_length": 889.5254516601562, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.06129137276180141, + "grad_norm": 0.05818593502044678, + "kl": 0.006038665771484375, + "learning_rate": 1e-06, + "loss": 0.0534, + "num_tokens": 22749897.0, + "reward": 0.3638392984867096, + "reward_std": 0.21515080332756042, + "rewards/accuracy_reward/mean": 0.3704545497894287, + "rewards/accuracy_reward/std": 0.4834761619567871, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0001823139191401424, + "clip_ratio/high_mean": 3.733947960427031e-05, + "clip_ratio/low_mean": 2.3658254804104217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0997734408374527e-05, + "epoch": 0.06146500271296799, + "grad_norm": 0.05713135376572609, + "kl": 0.006031036376953125, + "learning_rate": 1e-06, + "loss": 0.0534, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0004046872491016984, + "clip_ratio/high_mean": 7.681633132960997e-05, + "clip_ratio/low_mean": 7.488967753488396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015170601113823068, + "epoch": 0.061638632664134566, + "grad_norm": 0.054760102182626724, + "kl": 0.00608062744140625, + "learning_rate": 1e-06, + "loss": 0.0532, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0005605849892162951, + "clip_ratio/high_mean": 0.00011883493357345287, + "clip_ratio/low_mean": 9.809467883314937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021692961900043883, + "epoch": 0.061812262615301136, + "grad_norm": 0.05416857451200485, + "kl": 0.006103515625, + "learning_rate": 1e-06, + "loss": 0.0531, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0008072235086729052, + "clip_ratio/high_mean": 0.00017728807324601803, + "clip_ratio/low_mean": 0.00013519402682504733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00031248209916157066, + "epoch": 0.061985892566467714, + "grad_norm": 0.05352313816547394, + "kl": 0.006137847900390625, + "learning_rate": 1e-06, + "loss": 0.0529, + "step": 357 + }, + { + "clip_ratio/high_max": 0.001043116622895468, + "clip_ratio/high_mean": 0.0002840233919414459, + "clip_ratio/low_mean": 0.00022088578862167196, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005049091714681708, + "epoch": 0.06215952251763429, + "grad_norm": 0.052352771162986755, + "kl": 0.006237030029296875, + "learning_rate": 1e-06, + "loss": 0.0527, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0015506240724789677, + "clip_ratio/high_mean": 0.0003829867150670907, + "clip_ratio/low_mean": 0.0003457808202256274, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007287675280167605, + "epoch": 0.06233315246880087, + "grad_norm": 0.0518171563744545, + "kl": 0.0062103271484375, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0020625034921977203, + "clip_ratio/high_mean": 0.0004773240989379701, + "clip_ratio/low_mean": 0.0004351693314674776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009124934103965643, + "epoch": 0.06250678241996745, + "grad_norm": 0.05028560385107994, + "kl": 0.006351470947265625, + "learning_rate": 1e-06, + "loss": 0.0523, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3004.0, + "completions/mean_length": 706.9754638671875, + "completions/mean_terminated_length": 663.9749755859375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.06268041237113402, + "grad_norm": 0.08463160693645477, + "kl": 0.0084075927734375, + "learning_rate": 1e-06, + "loss": 0.0658, + "num_tokens": 23128646.0, + "reward": 0.4308035969734192, + "reward_std": 0.25761252641677856, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "step": 361 + }, + { + "clip_ratio/high_max": 0.00027033269543608185, + "clip_ratio/high_mean": 4.487076716941374e-05, + "clip_ratio/low_mean": 3.6806141451961594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.167690953087003e-05, + "epoch": 0.0628540423223006, + "grad_norm": 0.08365386724472046, + "kl": 0.0084228515625, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 362 + }, + { + "clip_ratio/high_max": 0.00040712214104132727, + "clip_ratio/high_mean": 9.958373630070128e-05, + "clip_ratio/low_mean": 7.011401976342313e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001696977515166509, + "epoch": 0.06302767227346717, + "grad_norm": 0.08049225062131882, + "kl": 0.008441925048828125, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0009244686480087694, + "clip_ratio/high_mean": 0.00019890181010850938, + "clip_ratio/low_mean": 0.0001900474526337348, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00038894926001376007, + "epoch": 0.06320130222463374, + "grad_norm": 0.0748177096247673, + "kl": 0.00860595703125, + "learning_rate": 1e-06, + "loss": 0.0653, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0014030274032847956, + "clip_ratio/high_mean": 0.0003736615990419523, + "clip_ratio/low_mean": 0.00030446895380009664, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006781305510230595, + "epoch": 0.06337493217580033, + "grad_norm": 0.07293203473091125, + "kl": 0.00872039794921875, + "learning_rate": 1e-06, + "loss": 0.0651, + "step": 365 + }, + { + "clip_ratio/high_max": 0.001926543758600019, + "clip_ratio/high_mean": 0.0004975775227649137, + "clip_ratio/low_mean": 0.0004623770300895558, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009599545337550808, + "epoch": 0.0635485621269669, + "grad_norm": 0.07116150856018066, + "kl": 0.008884429931640625, + "learning_rate": 1e-06, + "loss": 0.0648, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0026287388463970274, + "clip_ratio/high_mean": 0.0006728721673425753, + "clip_ratio/low_mean": 0.0006116924014349934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012845645542256534, + "epoch": 0.06372219207813348, + "grad_norm": 0.07323356717824936, + "kl": 0.009067535400390625, + "learning_rate": 1e-06, + "loss": 0.0645, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0034251161850988865, + "clip_ratio/high_mean": 0.0009186936949845403, + "clip_ratio/low_mean": 0.0007596078030474018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016783014871180058, + "epoch": 0.06389582202930005, + "grad_norm": 0.06789842247962952, + "kl": 0.00916290283203125, + "learning_rate": 1e-06, + "loss": 0.0641, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2994.0, + "completions/mean_length": 1045.4866943359375, + "completions/mean_terminated_length": 970.4305419921875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.06406945198046662, + "grad_norm": 0.051936183124780655, + "kl": 0.005565643310546875, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 23665624.0, + "reward": 0.267857164144516, + "reward_std": 0.1885526329278946, + "rewards/accuracy_reward/mean": 0.2678571343421936, + "rewards/accuracy_reward/std": 0.4433377981185913, + "step": 369 + }, + { + "clip_ratio/high_max": 5.4333193475031294e-05, + "clip_ratio/high_mean": 9.046201512319385e-06, + "clip_ratio/low_mean": 5.7337443649885245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.63836453895783e-05, + "epoch": 0.06424308193163321, + "grad_norm": 0.050510816276073456, + "kl": 0.00559234619140625, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0001975403993128566, + "clip_ratio/high_mean": 3.56086854935711e-05, + "clip_ratio/low_mean": 6.963214400457218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010524082767915388, + "epoch": 0.06441671188279978, + "grad_norm": 0.04992562159895897, + "kl": 0.00553131103515625, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 371 + }, + { + "clip_ratio/high_max": 0.000229279610721278, + "clip_ratio/high_mean": 5.4440325925497746e-05, + "clip_ratio/low_mean": 0.00010199244979958166, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015643277765775565, + "epoch": 0.06459034183396636, + "grad_norm": 0.04871482774615288, + "kl": 0.005496978759765625, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0003177165035594953, + "clip_ratio/high_mean": 6.525937646983948e-05, + "clip_ratio/low_mean": 0.0001655387322898605, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002307981048943475, + "epoch": 0.06476397178513293, + "grad_norm": 0.047261085361242294, + "kl": 0.005504608154296875, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0006164622500364203, + "clip_ratio/high_mean": 0.00015622815271854051, + "clip_ratio/low_mean": 0.00025050604654097697, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004067341924383072, + "epoch": 0.0649376017362995, + "grad_norm": 0.04623940959572792, + "kl": 0.005558013916015625, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0008460017052129842, + "clip_ratio/high_mean": 0.00020639320155169116, + "clip_ratio/low_mean": 0.00033035484830179485, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005367480325730867, + "epoch": 0.06511123168746609, + "grad_norm": 0.045847054570913315, + "kl": 0.005550384521484375, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0010400067985756323, + "clip_ratio/high_mean": 0.000255543619459786, + "clip_ratio/low_mean": 0.00039686241279923706, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000652406038625486, + "epoch": 0.06528486163863266, + "grad_norm": 0.04530995339155197, + "kl": 0.0055999755859375, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3054.0, + "completions/mean_length": 963.8348388671875, + "completions/mean_terminated_length": 895.8294677734375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.06545849158979924, + "grad_norm": 0.062260325998067856, + "kl": 0.0054988861083984375, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 24162670.0, + "reward": 0.3437500298023224, + "reward_std": 0.28045570850372314, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "step": 377 + }, + { + "clip_ratio/high_max": 0.00012842282148994855, + "clip_ratio/high_mean": 2.7283734823413397e-05, + "clip_ratio/low_mean": 8.140499608089158e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010868873005165369, + "epoch": 0.06563212154096582, + "grad_norm": 0.06222591921687126, + "kl": 0.005550384521484375, + "learning_rate": 1e-06, + "loss": 0.0346, + "step": 378 + }, + { + "clip_ratio/high_max": 0.00019942150538554415, + "clip_ratio/high_mean": 4.637208849089802e-05, + "clip_ratio/low_mean": 7.18881940429128e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001182602818516898, + "epoch": 0.06580575149213239, + "grad_norm": 0.06224103644490242, + "kl": 0.0055294036865234375, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 379 + }, + { + "clip_ratio/high_max": 0.00030399567913264036, + "clip_ratio/high_mean": 6.944563665456371e-05, + "clip_ratio/low_mean": 8.665165478305425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015609729371135472, + "epoch": 0.06597938144329897, + "grad_norm": 0.06163451448082924, + "kl": 0.00559234619140625, + "learning_rate": 1e-06, + "loss": 0.0344, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0004986577914678492, + "clip_ratio/high_mean": 0.00013183758073864738, + "clip_ratio/low_mean": 0.00019675016392284306, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003285877355665434, + "epoch": 0.06615301139446554, + "grad_norm": 0.059781916439533234, + "kl": 0.005664825439453125, + "learning_rate": 1e-06, + "loss": 0.0342, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0008315210397995543, + "clip_ratio/high_mean": 0.00023973990755621344, + "clip_ratio/low_mean": 0.00028937037222931394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005291102716000751, + "epoch": 0.06632664134563213, + "grad_norm": 0.058572035282850266, + "kl": 0.0057392120361328125, + "learning_rate": 1e-06, + "loss": 0.0339, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0013553261669585481, + "clip_ratio/high_mean": 0.0003863876627292484, + "clip_ratio/low_mean": 0.0003759399478440173, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007623276142112445, + "epoch": 0.0665002712967987, + "grad_norm": 0.05742562934756279, + "kl": 0.005706787109375, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0018094930055667646, + "clip_ratio/high_mean": 0.0005186253019928699, + "clip_ratio/low_mean": 0.0005610195330518764, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010796448368637357, + "epoch": 0.06667390124796528, + "grad_norm": 0.05606549233198166, + "kl": 0.00598907470703125, + "learning_rate": 1e-06, + "loss": 0.0333, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3044.0, + "completions/mean_length": 1011.49560546875, + "completions/mean_terminated_length": 945.0276489257812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.06684753119913185, + "grad_norm": 0.07418453693389893, + "kl": 0.006122589111328125, + "learning_rate": 1e-06, + "loss": 0.0554, + "num_tokens": 24684884.0, + "reward": 0.3035714328289032, + "reward_std": 0.27603086829185486, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "step": 385 + }, + { + "clip_ratio/high_max": 0.000352866875800828, + "clip_ratio/high_mean": 6.640938488544634e-05, + "clip_ratio/low_mean": 3.7655452047147264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010406483977476455, + "epoch": 0.06702116115029842, + "grad_norm": 0.07364295423030853, + "kl": 0.00611114501953125, + "learning_rate": 1e-06, + "loss": 0.0553, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0004125107134314021, + "clip_ratio/high_mean": 7.116471510926203e-05, + "clip_ratio/low_mean": 5.350015669591812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012466487487472477, + "epoch": 0.067194791101465, + "grad_norm": 0.07329711318016052, + "kl": 0.0060863494873046875, + "learning_rate": 1e-06, + "loss": 0.0551, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0005055143155914266, + "clip_ratio/high_mean": 0.00011390850272618991, + "clip_ratio/low_mean": 0.00013419330161923426, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024810179638734553, + "epoch": 0.06736842105263158, + "grad_norm": 0.07137156277894974, + "kl": 0.006160736083984375, + "learning_rate": 1e-06, + "loss": 0.0549, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0010914538888755487, + "clip_ratio/high_mean": 0.00020038234470121097, + "clip_ratio/low_mean": 0.0001768273932611919, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00037720973159593996, + "epoch": 0.06754205100379816, + "grad_norm": 0.06992734968662262, + "kl": 0.006137847900390625, + "learning_rate": 1e-06, + "loss": 0.0546, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0013735963966610143, + "clip_ratio/high_mean": 0.00029584498315671226, + "clip_ratio/low_mean": 0.0003325326633785153, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006283776638156269, + "epoch": 0.06771568095496473, + "grad_norm": 0.06614859402179718, + "kl": 0.0061893463134765625, + "learning_rate": 1e-06, + "loss": 0.0543, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0022526490065502003, + "clip_ratio/high_mean": 0.00047240142566806753, + "clip_ratio/low_mean": 0.0005063878488726914, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009787892413442023, + "epoch": 0.0678893109061313, + "grad_norm": 0.06549523025751114, + "kl": 0.006252288818359375, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0035865396130247973, + "clip_ratio/high_mean": 0.0007068791674100794, + "clip_ratio/low_mean": 0.000734216722776182, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014410958756343462, + "epoch": 0.06806294085729789, + "grad_norm": 0.06417727470397949, + "kl": 0.006267547607421875, + "learning_rate": 1e-06, + "loss": 0.0536, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013392857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 1039.3482666015625, + "completions/mean_terminated_length": 1011.7556762695312, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06823657080846446, + "grad_norm": 0.05862051248550415, + "kl": 0.005451202392578125, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 25216224.0, + "reward": 0.3348214328289032, + "reward_std": 0.25309935212135315, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "step": 393 + }, + { + "clip_ratio/high_max": 0.00010668078175513074, + "clip_ratio/high_mean": 3.206022620361182e-05, + "clip_ratio/low_mean": 3.85724141551691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.063263990403357e-05, + "epoch": 0.06841020075963104, + "grad_norm": 0.05794129520654678, + "kl": 0.005462646484375, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 394 + }, + { + "clip_ratio/high_max": 0.00027231494732404826, + "clip_ratio/high_mean": 7.049979626572167e-05, + "clip_ratio/low_mean": 9.032347043103073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016082326965261018, + "epoch": 0.06858383071079761, + "grad_norm": 0.05751175805926323, + "kl": 0.0054759979248046875, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 395 + }, + { + "clip_ratio/high_max": 0.00031694677818450145, + "clip_ratio/high_mean": 7.92699520388851e-05, + "clip_ratio/low_mean": 0.00011729930929504917, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019656926724564983, + "epoch": 0.06875746066196418, + "grad_norm": 0.05568217486143112, + "kl": 0.0055255889892578125, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0005117202599649318, + "clip_ratio/high_mean": 0.00011753916714951629, + "clip_ratio/low_mean": 0.00019117049896522076, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003087096674789791, + "epoch": 0.06893109061313077, + "grad_norm": 0.05319046601653099, + "kl": 0.0056018829345703125, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0006662047217105282, + "clip_ratio/high_mean": 0.00016839862382767024, + "clip_ratio/low_mean": 0.000254626169862604, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00042302478323108517, + "epoch": 0.06910472056429734, + "grad_norm": 0.05130859836935997, + "kl": 0.0056610107421875, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0010898982782236999, + "clip_ratio/high_mean": 0.00028432466660888167, + "clip_ratio/low_mean": 0.000398426929677953, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006827515826444142, + "epoch": 0.06927835051546392, + "grad_norm": 0.04964437708258629, + "kl": 0.00576019287109375, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0013905525374866556, + "clip_ratio/high_mean": 0.00036637676657846896, + "clip_ratio/low_mean": 0.0005595695110969245, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009259462967747822, + "epoch": 0.0694519804666305, + "grad_norm": 0.04843536391854286, + "kl": 0.005832672119140625, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2974.0, + "completions/mean_length": 864.6361694335938, + "completions/mean_terminated_length": 803.8829956054688, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.06962561041779707, + "grad_norm": 0.07142940163612366, + "kl": 0.007968902587890625, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 25671773.0, + "reward": 0.3660714328289032, + "reward_std": 0.24454915523529053, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.4822677969932556, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0002728405388552346, + "clip_ratio/high_mean": 5.537900256058492e-05, + "clip_ratio/low_mean": 4.8650332473698654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010402933503428358, + "epoch": 0.06979924036896365, + "grad_norm": 0.07058944553136826, + "kl": 0.008022308349609375, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 402 + }, + { + "clip_ratio/high_max": 0.00019992188481410267, + "clip_ratio/high_mean": 6.391936040017754e-05, + "clip_ratio/low_mean": 8.929291880122037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001532122792013979, + "epoch": 0.06997287032013022, + "grad_norm": 0.0684940367937088, + "kl": 0.008075714111328125, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0006094526006563683, + "clip_ratio/high_mean": 0.00013758329714619322, + "clip_ratio/low_mean": 0.00016737531041144393, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003049586061933951, + "epoch": 0.0701465002712968, + "grad_norm": 0.06710630655288696, + "kl": 0.008289337158203125, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0008618239608040312, + "clip_ratio/high_mean": 0.00019271238170404104, + "clip_ratio/low_mean": 0.00021670928072126117, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00040942167606772273, + "epoch": 0.07032013022246338, + "grad_norm": 0.06544848531484604, + "kl": 0.008281707763671875, + "learning_rate": 1e-06, + "loss": 0.0052, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0014399770479940344, + "clip_ratio/high_mean": 0.0003298482024547411, + "clip_ratio/low_mean": 0.0003534736620167678, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006833218744759506, + "epoch": 0.07049376017362995, + "grad_norm": 0.06409391760826111, + "kl": 0.008350372314453125, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0019191915580449859, + "clip_ratio/high_mean": 0.0005047339013799501, + "clip_ratio/low_mean": 0.0005321281050783, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010368619969085557, + "epoch": 0.07066739012479653, + "grad_norm": 0.06259263306856155, + "kl": 0.008594512939453125, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0030728118726983666, + "clip_ratio/high_mean": 0.000799487347194372, + "clip_ratio/low_mean": 0.0007166916193455108, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015161789997364394, + "epoch": 0.0708410200759631, + "grad_norm": 0.06172160431742668, + "kl": 0.008762359619140625, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3056.0, + "completions/mean_length": 972.2188110351562, + "completions/mean_terminated_length": 904.48388671875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.07101465002712969, + "grad_norm": 0.04366274178028107, + "kl": 0.008502960205078125, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 26168631.0, + "reward": 0.2321428656578064, + "reward_std": 0.163971409201622, + "rewards/accuracy_reward/mean": 0.2321428507566452, + "rewards/accuracy_reward/std": 0.4226716458797455, + "step": 409 + }, + { + "clip_ratio/high_max": 9.383076121594058e-05, + "clip_ratio/high_mean": 1.549490502839035e-05, + "clip_ratio/low_mean": 4.198062413252046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.747552722823457e-05, + "epoch": 0.07118827997829626, + "grad_norm": 0.0434463694691658, + "kl": 0.008396148681640625, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 410 + }, + { + "clip_ratio/high_max": 0.00012047775089740753, + "clip_ratio/high_mean": 1.72111068650338e-05, + "clip_ratio/low_mean": 4.2207720298392815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.9418827504487126e-05, + "epoch": 0.07136190992946283, + "grad_norm": 0.0431443490087986, + "kl": 0.008533477783203125, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0001803086443032953, + "clip_ratio/high_mean": 3.742725255051482e-05, + "clip_ratio/low_mean": 7.685498485443532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011428223751863698, + "epoch": 0.07153553988062941, + "grad_norm": 0.042289454489946365, + "kl": 0.00852203369140625, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 412 + }, + { + "clip_ratio/high_max": 0.00042466634022275684, + "clip_ratio/high_mean": 7.754144917271333e-05, + "clip_ratio/low_mean": 8.696327608959109e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016450472844553587, + "epoch": 0.07170916983179598, + "grad_norm": 0.04204561933875084, + "kl": 0.008403778076171875, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0005568761062022531, + "clip_ratio/high_mean": 9.960420629795408e-05, + "clip_ratio/low_mean": 0.00014955328992982686, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024915749054343905, + "epoch": 0.07188279978296257, + "grad_norm": 0.04087040200829506, + "kl": 0.008449554443359375, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0006966561286390061, + "clip_ratio/high_mean": 0.0001483443070355861, + "clip_ratio/low_mean": 0.00020027076880069217, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003486150617391104, + "epoch": 0.07205642973412914, + "grad_norm": 0.03985971584916115, + "kl": 0.008544921875, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0011922293051611632, + "clip_ratio/high_mean": 0.0002501657481843722, + "clip_ratio/low_mean": 0.00029634740985784447, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005465131562232273, + "epoch": 0.07223005968529571, + "grad_norm": 0.03936678543686867, + "kl": 0.008647918701171875, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 951.26123046875, + "completions/mean_terminated_length": 887.8827514648438, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.07240368963646229, + "grad_norm": 0.0712510421872139, + "kl": 0.0083770751953125, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 26669724.0, + "reward": 0.3392857313156128, + "reward_std": 0.2685191035270691, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399619221687317, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0001980896604436566, + "clip_ratio/high_mean": 4.039480995743361e-05, + "clip_ratio/low_mean": 3.0251079124354874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.064588930916216e-05, + "epoch": 0.07257731958762886, + "grad_norm": 0.07169051468372345, + "kl": 0.00839996337890625, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 418 + }, + { + "clip_ratio/high_max": 0.00037027615326223895, + "clip_ratio/high_mean": 9.366267477162182e-05, + "clip_ratio/low_mean": 5.6742893775663106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015040556627354817, + "epoch": 0.07275094953879545, + "grad_norm": 0.07036883383989334, + "kl": 0.0084991455078125, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0006121776496001985, + "clip_ratio/high_mean": 0.00013705155038223893, + "clip_ratio/low_mean": 0.0001048125393481314, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024186410064430675, + "epoch": 0.07292457948996202, + "grad_norm": 0.0681179016828537, + "kl": 0.00864410400390625, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 420 + }, + { + "clip_ratio/high_max": 0.000898930376934004, + "clip_ratio/high_mean": 0.00021959520108794095, + "clip_ratio/low_mean": 0.00020296448019507807, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004225596685500932, + "epoch": 0.07309820944112859, + "grad_norm": 0.0674973726272583, + "kl": 0.0087127685546875, + "learning_rate": 1e-06, + "loss": 0.0048, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0016956567778834142, + "clip_ratio/high_mean": 0.00039947976347320946, + "clip_ratio/low_mean": 0.00038196422246983275, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007814439977664733, + "epoch": 0.07327183939229517, + "grad_norm": 0.06373700499534607, + "kl": 0.00872039794921875, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0024017421183089027, + "clip_ratio/high_mean": 0.0005979948896310816, + "clip_ratio/low_mean": 0.0004905500445602229, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010885449337365571, + "epoch": 0.07344546934346174, + "grad_norm": 0.06043197587132454, + "kl": 0.008098602294921875, + "learning_rate": 1e-06, + "loss": 0.0042, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0035094933664368, + "clip_ratio/high_mean": 0.0008663268708914984, + "clip_ratio/low_mean": 0.0006749141375621548, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015412410757562611, + "epoch": 0.07361909929462833, + "grad_norm": 0.05864177271723747, + "kl": 0.00823974609375, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022321428571428603, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 900.5178833007812, + "completions/mean_terminated_length": 850.9406127929688, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.0737927292457949, + "grad_norm": 0.06137944385409355, + "kl": 0.00884246826171875, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 27140444.0, + "reward": 0.3861607313156128, + "reward_std": 0.20132926106452942, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124526977539, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0002520838024793193, + "clip_ratio/high_mean": 5.545030830944597e-05, + "clip_ratio/low_mean": 6.998085245868424e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012543115872176713, + "epoch": 0.07396635919696147, + "grad_norm": 0.06072656065225601, + "kl": 0.0089263916015625, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0003339539107400924, + "clip_ratio/high_mean": 5.282412439555628e-05, + "clip_ratio/low_mean": 7.376592020591488e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012659004369197646, + "epoch": 0.07413998914812805, + "grad_norm": 0.059641845524311066, + "kl": 0.008968353271484375, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0004723690726677887, + "clip_ratio/high_mean": 9.638588971938589e-05, + "clip_ratio/low_mean": 0.00011440723619671189, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021079312500660308, + "epoch": 0.07431361909929463, + "grad_norm": 0.05852828174829483, + "kl": 0.009014129638671875, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0007509872848459054, + "clip_ratio/high_mean": 0.00018970008477481315, + "clip_ratio/low_mean": 0.0001685194174569915, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035821950950776227, + "epoch": 0.07448724905046121, + "grad_norm": 0.05751383677124977, + "kl": 0.00910186767578125, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 429 + }, + { + "clip_ratio/high_max": 0.000881353575096, + "clip_ratio/high_mean": 0.00023693785624345765, + "clip_ratio/low_mean": 0.00029066501156194136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005276028659864096, + "epoch": 0.07466087900162778, + "grad_norm": 0.05541093647480011, + "kl": 0.0091705322265625, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0015520932283834554, + "clip_ratio/high_mean": 0.0003970212219428504, + "clip_ratio/low_mean": 0.0004051732557854848, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008021944704523776, + "epoch": 0.07483450895279435, + "grad_norm": 0.05405551195144653, + "kl": 0.00917816162109375, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 431 + }, + { + "clip_ratio/high_max": 0.002093695591611322, + "clip_ratio/high_mean": 0.0005334510369721102, + "clip_ratio/low_mean": 0.0006424109797080746, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011758620166801848, + "epoch": 0.07500813890396094, + "grad_norm": 0.052185870707035065, + "kl": 0.009548187255859375, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0379464285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2959.0, + "completions/mean_length": 910.5313110351562, + "completions/mean_terminated_length": 825.2760620117188, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.0751817688551275, + "grad_norm": 0.06408514082431793, + "kl": 0.00791168212890625, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 27613002.0, + "reward": 0.3571428656578064, + "reward_std": 0.22747182846069336, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "step": 433 + }, + { + "clip_ratio/high_max": 0.00032915492192842066, + "clip_ratio/high_mean": 6.146695795905543e-05, + "clip_ratio/low_mean": 4.9082045052273315e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011054900414819713, + "epoch": 0.07535539880629409, + "grad_norm": 0.06386161595582962, + "kl": 0.00794219970703125, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 434 + }, + { + "clip_ratio/high_max": 0.00033029128371708794, + "clip_ratio/high_mean": 6.62878933326283e-05, + "clip_ratio/low_mean": 7.654843079762941e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014283632071965258, + "epoch": 0.07552902875746066, + "grad_norm": 0.06266144663095474, + "kl": 0.00798797607421875, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0004461141888896236, + "clip_ratio/high_mean": 0.00010708630998124136, + "clip_ratio/low_mean": 8.269277532235719e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018977908712258795, + "epoch": 0.07570265870862723, + "grad_norm": 0.060915980488061905, + "kl": 0.007991790771484375, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0006687416444037808, + "clip_ratio/high_mean": 0.00018520303660807258, + "clip_ratio/low_mean": 0.00017971742590816575, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003649204491011915, + "epoch": 0.07587628865979382, + "grad_norm": 0.05859021842479706, + "kl": 0.00811004638671875, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0011784889447881142, + "clip_ratio/high_mean": 0.00034302008361919434, + "clip_ratio/low_mean": 0.00039042806611178094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007334481388170389, + "epoch": 0.07604991861096039, + "grad_norm": 0.058285217732191086, + "kl": 0.008167266845703125, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0013429916798486374, + "clip_ratio/high_mean": 0.00040874550359149, + "clip_ratio/low_mean": 0.0005450707162708568, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009538162539683981, + "epoch": 0.07622354856212697, + "grad_norm": 0.05627021566033363, + "kl": 0.00823974609375, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0021648298716172576, + "clip_ratio/high_mean": 0.000634390271443408, + "clip_ratio/low_mean": 0.0007858561630200711, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014202464790287195, + "epoch": 0.07639717851329354, + "grad_norm": 0.05428881570696831, + "kl": 0.008388519287109375, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0401785714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2999.0, + "completions/mean_length": 944.2567138671875, + "completions/mean_terminated_length": 855.1883544921875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07657080846446011, + "grad_norm": 0.07092641294002533, + "kl": 0.00841522216796875, + "learning_rate": 1e-06, + "loss": 0.0627, + "num_tokens": 28100981.0, + "reward": 0.2790178656578064, + "reward_std": 0.23897472023963928, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "step": 441 + }, + { + "clip_ratio/high_max": 0.00028675900648522656, + "clip_ratio/high_mean": 5.576549665420316e-05, + "clip_ratio/low_mean": 5.556601399803185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011133151065223501, + "epoch": 0.0767444384156267, + "grad_norm": 0.06717579066753387, + "kl": 0.008350372314453125, + "learning_rate": 1e-06, + "loss": 0.0627, + "step": 442 + }, + { + "clip_ratio/high_max": 0.00025926873149728635, + "clip_ratio/high_mean": 6.155711344035808e-05, + "clip_ratio/low_mean": 0.00013610883524961537, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019766595733017311, + "epoch": 0.07691806836679327, + "grad_norm": 0.07398205250501633, + "kl": 0.008419036865234375, + "learning_rate": 1e-06, + "loss": 0.0625, + "step": 443 + }, + { + "clip_ratio/high_max": 0.00046373161057999823, + "clip_ratio/high_mean": 0.00011668151273624972, + "clip_ratio/low_mean": 0.00038669602736263187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005033775505580707, + "epoch": 0.07709169831795985, + "grad_norm": 0.0684816986322403, + "kl": 0.00856781005859375, + "learning_rate": 1e-06, + "loss": 0.0623, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0007958723199408269, + "clip_ratio/high_mean": 0.00020857080949099327, + "clip_ratio/low_mean": 0.0005750166155849001, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007835874466763926, + "epoch": 0.07726532826912642, + "grad_norm": 0.06696873158216476, + "kl": 0.008373260498046875, + "learning_rate": 1e-06, + "loss": 0.062, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0016781306967459386, + "clip_ratio/high_mean": 0.0003922469509234361, + "clip_ratio/low_mean": 0.0007365903793470352, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011288373334537027, + "epoch": 0.077438958220293, + "grad_norm": 0.0638066977262497, + "kl": 0.0086517333984375, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0031658518200856633, + "clip_ratio/high_mean": 0.0006687960112685687, + "clip_ratio/low_mean": 0.0009419996295036981, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016107955707411747, + "epoch": 0.07761258817145958, + "grad_norm": 0.062295448035001755, + "kl": 0.00868988037109375, + "learning_rate": 1e-06, + "loss": 0.0614, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0034769001867971383, + "clip_ratio/high_mean": 0.0007939349707157817, + "clip_ratio/low_mean": 0.001156302341769333, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001950237354321871, + "epoch": 0.07778621812262615, + "grad_norm": 0.0612034797668457, + "kl": 0.00891876220703125, + "learning_rate": 1e-06, + "loss": 0.0611, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3015.0, + "completions/mean_length": 996.794677734375, + "completions/mean_terminated_length": 919.9351806640625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.07795984807379273, + "grad_norm": 0.07190510630607605, + "kl": 0.009708404541015625, + "learning_rate": 1e-06, + "loss": 0.036, + "num_tokens": 28619889.0, + "reward": 0.2924107313156128, + "reward_std": 0.2514510452747345, + "rewards/accuracy_reward/mean": 0.2924107015132904, + "rewards/accuracy_reward/std": 0.4553784728050232, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0002039910377789056, + "clip_ratio/high_mean": 4.3777133669209434e-05, + "clip_ratio/low_mean": 5.606702507066075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.984415783037548e-05, + "epoch": 0.0781334780249593, + "grad_norm": 0.06664367765188217, + "kl": 0.009746551513671875, + "learning_rate": 1e-06, + "loss": 0.036, + "step": 450 + }, + { + "clip_ratio/high_max": 0.00029485604136425536, + "clip_ratio/high_mean": 6.758294739483972e-05, + "clip_ratio/low_mean": 8.304155221594556e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015062450165714836, + "epoch": 0.07830710797612588, + "grad_norm": 0.065585196018219, + "kl": 0.009796142578125, + "learning_rate": 1e-06, + "loss": 0.0358, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0006348847855406348, + "clip_ratio/high_mean": 0.00015713584116383572, + "clip_ratio/low_mean": 0.00013917691194365034, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002963127535622334, + "epoch": 0.07848073792729246, + "grad_norm": 0.06515642255544662, + "kl": 0.009876251220703125, + "learning_rate": 1e-06, + "loss": 0.0356, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0011169466852152254, + "clip_ratio/high_mean": 0.0002542041484048241, + "clip_ratio/low_mean": 0.00016535621352886665, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004195603614789434, + "epoch": 0.07865436787845903, + "grad_norm": 0.06426515430212021, + "kl": 0.009990692138671875, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0016634222884022165, + "clip_ratio/high_mean": 0.0003812199938693084, + "clip_ratio/low_mean": 0.00028937515799043467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006705951382173225, + "epoch": 0.07882799782962562, + "grad_norm": 0.06321360170841217, + "kl": 0.00995635986328125, + "learning_rate": 1e-06, + "loss": 0.035, + "step": 454 + }, + { + "clip_ratio/high_max": 0.002406130501185544, + "clip_ratio/high_mean": 0.0005649960767186712, + "clip_ratio/low_mean": 0.0003828420267382171, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009478381070948672, + "epoch": 0.07900162778079219, + "grad_norm": 0.0604434497654438, + "kl": 0.0101165771484375, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0034112166467821226, + "clip_ratio/high_mean": 0.0008614017679065, + "clip_ratio/low_mean": 0.0004977028493158286, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013591045499197207, + "epoch": 0.07917525773195876, + "grad_norm": 0.05947640538215637, + "kl": 0.010150909423828125, + "learning_rate": 1e-06, + "loss": 0.0344, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 957.4308471679688, + "completions/mean_terminated_length": 858.619140625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.07934888768312534, + "grad_norm": 0.07571888715028763, + "kl": 0.008075714111328125, + "learning_rate": 1e-06, + "loss": 0.047, + "num_tokens": 29118522.0, + "reward": 0.3348214328289032, + "reward_std": 0.23942893743515015, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0001382025739076198, + "clip_ratio/high_mean": 2.0901050675092847e-05, + "clip_ratio/low_mean": 2.4410988885392726e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5312037400435656e-05, + "epoch": 0.07952251763429191, + "grad_norm": 0.07198500633239746, + "kl": 0.008098602294921875, + "learning_rate": 1e-06, + "loss": 0.047, + "step": 458 + }, + { + "clip_ratio/high_max": 0.00029282597824931145, + "clip_ratio/high_mean": 6.090299370953289e-05, + "clip_ratio/low_mean": 6.852567832993373e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012942866942466935, + "epoch": 0.0796961475854585, + "grad_norm": 0.06921639293432236, + "kl": 0.0081329345703125, + "learning_rate": 1e-06, + "loss": 0.0468, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0005976662905595731, + "clip_ratio/high_mean": 0.00012058914944645949, + "clip_ratio/low_mean": 0.0001481662434343889, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026875539697357453, + "epoch": 0.07986977753662507, + "grad_norm": 0.06589902192354202, + "kl": 0.00849151611328125, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0009460280889470596, + "clip_ratio/high_mean": 0.00023149694152380107, + "clip_ratio/low_mean": 0.0003092713386649848, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005407682820077753, + "epoch": 0.08004340748779164, + "grad_norm": 0.06466376781463623, + "kl": 0.008792877197265625, + "learning_rate": 1e-06, + "loss": 0.0463, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0014135776546027046, + "clip_ratio/high_mean": 0.0003597660520426871, + "clip_ratio/low_mean": 0.00042870851211773697, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007884745191404363, + "epoch": 0.08021703743895822, + "grad_norm": 0.06328453868627548, + "kl": 0.009098052978515625, + "learning_rate": 1e-06, + "loss": 0.046, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0021365940119721927, + "clip_ratio/high_mean": 0.0005611996148218168, + "clip_ratio/low_mean": 0.0007339924104599049, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012951920725754462, + "epoch": 0.08039066739012479, + "grad_norm": 0.060620568692684174, + "kl": 0.00952911376953125, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0028105016608606093, + "clip_ratio/high_mean": 0.0007667056306672748, + "clip_ratio/low_mean": 0.0008036287854338298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001570334436109988, + "epoch": 0.08056429734129138, + "grad_norm": 0.059903573244810104, + "kl": 0.0110931396484375, + "learning_rate": 1e-06, + "loss": 0.0454, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2998.0, + "completions/mean_length": 875.2522583007812, + "completions/mean_terminated_length": 809.602294921875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.08073792729245795, + "grad_norm": 0.08080428093671799, + "kl": 0.010009765625, + "learning_rate": 1e-06, + "loss": 0.0749, + "num_tokens": 29574883.0, + "reward": 0.3906250298023224, + "reward_std": 0.24506700038909912, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "step": 465 + }, + { + "clip_ratio/high_max": 0.00015592802901664982, + "clip_ratio/high_mean": 3.214462606138113e-05, + "clip_ratio/low_mean": 3.6921463561157e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.906608905410394e-05, + "epoch": 0.08091155724362452, + "grad_norm": 0.07947537302970886, + "kl": 0.01006317138671875, + "learning_rate": 1e-06, + "loss": 0.0748, + "step": 466 + }, + { + "clip_ratio/high_max": 0.00028147303964942694, + "clip_ratio/high_mean": 6.997926811891375e-05, + "clip_ratio/low_mean": 8.904413925847621e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001590234096511267, + "epoch": 0.0810851871947911, + "grad_norm": 0.07425659894943237, + "kl": 0.0102081298828125, + "learning_rate": 1e-06, + "loss": 0.0746, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0004964243544236524, + "clip_ratio/high_mean": 0.00013633510729960108, + "clip_ratio/low_mean": 0.0001434014147889684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00027973652504442725, + "epoch": 0.08125881714595767, + "grad_norm": 0.0686664953827858, + "kl": 0.01013946533203125, + "learning_rate": 1e-06, + "loss": 0.0744, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0008825306667858968, + "clip_ratio/high_mean": 0.00022595294103666674, + "clip_ratio/low_mean": 0.00022684548548568273, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00045279842015588656, + "epoch": 0.08143244709712426, + "grad_norm": 0.06770581752061844, + "kl": 0.01021575927734375, + "learning_rate": 1e-06, + "loss": 0.0741, + "step": 469 + }, + { + "clip_ratio/high_max": 0.001452717639040202, + "clip_ratio/high_mean": 0.0003859325934172375, + "clip_ratio/low_mean": 0.00036456312955124304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007504957193305017, + "epoch": 0.08160607704829083, + "grad_norm": 0.06468882411718369, + "kl": 0.010223388671875, + "learning_rate": 1e-06, + "loss": 0.0738, + "step": 470 + }, + { + "clip_ratio/high_max": 0.002253315324196592, + "clip_ratio/high_mean": 0.000612005414950545, + "clip_ratio/low_mean": 0.0005898926119698444, + "clip_ratio/low_min": 1.4333219041873235e-05, + "clip_ratio/region_mean": 0.001201898034196347, + "epoch": 0.08177970699945741, + "grad_norm": 0.06164233759045601, + "kl": 0.010345458984375, + "learning_rate": 1e-06, + "loss": 0.0735, + "step": 471 + }, + { + "clip_ratio/high_max": 0.002942253959190566, + "clip_ratio/high_mean": 0.0007968794607222662, + "clip_ratio/low_mean": 0.0007579805132991169, + "clip_ratio/low_min": 1.4333219041873235e-05, + "clip_ratio/region_mean": 0.0015548599658359308, + "epoch": 0.08195333695062398, + "grad_norm": 0.0613122321665287, + "kl": 0.01047515869140625, + "learning_rate": 1e-06, + "loss": 0.0732, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2942.0, + "completions/mean_length": 1029.6585693359375, + "completions/mean_terminated_length": 919.1317138671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.08212696690179055, + "grad_norm": 0.05986720323562622, + "kl": 0.01380157470703125, + "learning_rate": 1e-06, + "loss": 0.0513, + "num_tokens": 30103178.0, + "reward": 0.3214285969734192, + "reward_std": 0.2368021309375763, + "rewards/accuracy_reward/mean": 0.3214285671710968, + "rewards/accuracy_reward/std": 0.4675469994544983, + "step": 473 + }, + { + "clip_ratio/high_max": 0.00016146952930284897, + "clip_ratio/high_mean": 3.2751140224718256e-05, + "clip_ratio/low_mean": 4.689451236572495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.964565247675637e-05, + "epoch": 0.08230059685295714, + "grad_norm": 0.05966801568865776, + "kl": 0.0138702392578125, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0002358176634515985, + "clip_ratio/high_mean": 4.795559129888716e-05, + "clip_ratio/low_mean": 6.470488744980685e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011266047738445195, + "epoch": 0.08247422680412371, + "grad_norm": 0.058940593153238297, + "kl": 0.01373291015625, + "learning_rate": 1e-06, + "loss": 0.0511, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0001989734882954508, + "clip_ratio/high_mean": 3.4430059258738765e-05, + "clip_ratio/low_mean": 0.00013333030074136332, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016776035909060738, + "epoch": 0.0826478567552903, + "grad_norm": 0.05709509178996086, + "kl": 0.01419830322265625, + "learning_rate": 1e-06, + "loss": 0.051, + "step": 476 + }, + { + "clip_ratio/high_max": 0.00039067193029040936, + "clip_ratio/high_mean": 0.00010620298508001724, + "clip_ratio/low_mean": 0.0001853068670243374, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029150985483283876, + "epoch": 0.08282148670645686, + "grad_norm": 0.0553724579513073, + "kl": 0.01401519775390625, + "learning_rate": 1e-06, + "loss": 0.0509, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0006422748738259543, + "clip_ratio/high_mean": 0.00016922746272030054, + "clip_ratio/low_mean": 0.0003395449421077501, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005087724184704712, + "epoch": 0.08299511665762344, + "grad_norm": 0.052293356508016586, + "kl": 0.01436614990234375, + "learning_rate": 1e-06, + "loss": 0.0506, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0010985812878061552, + "clip_ratio/high_mean": 0.0002638073503931082, + "clip_ratio/low_mean": 0.0004924911172565771, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007562984410469653, + "epoch": 0.08316874660879002, + "grad_norm": 0.05177067965269089, + "kl": 0.0146331787109375, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0015262793531292118, + "clip_ratio/high_mean": 0.0004433856902323896, + "clip_ratio/low_mean": 0.0006560317542607663, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010994174554070923, + "epoch": 0.08334237655995659, + "grad_norm": 0.0510915145277977, + "kl": 0.0152587890625, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033482142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3061.0, + "completions/mean_length": 945.6719360351562, + "completions/mean_terminated_length": 872.0115356445312, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.08351600651112318, + "grad_norm": 0.06212655454874039, + "kl": 0.0097503662109375, + "learning_rate": 1e-06, + "loss": 0.0818, + "num_tokens": 30590823.0, + "reward": 0.3549107313156128, + "reward_std": 0.24348649382591248, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0002519359750294825, + "clip_ratio/high_mean": 4.3620444785119616e-05, + "clip_ratio/low_mean": 4.097622593235428e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.459667196802911e-05, + "epoch": 0.08368963646228975, + "grad_norm": 0.06153044477105141, + "kl": 0.0099334716796875, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 482 + }, + { + "clip_ratio/high_max": 0.00033465247179265134, + "clip_ratio/high_mean": 6.441856362471299e-05, + "clip_ratio/low_mean": 0.00010692933483369416, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017134790095951757, + "epoch": 0.08386326641345632, + "grad_norm": 0.05868428573012352, + "kl": 0.0099945068359375, + "learning_rate": 1e-06, + "loss": 0.0816, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0006801343788538361, + "clip_ratio/high_mean": 0.00014394641152648546, + "clip_ratio/low_mean": 0.00014504879027299467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002889952011173591, + "epoch": 0.0840368963646229, + "grad_norm": 0.058025408536195755, + "kl": 0.0101165771484375, + "learning_rate": 1e-06, + "loss": 0.0814, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0010280759452143684, + "clip_ratio/high_mean": 0.00020393272552610142, + "clip_ratio/low_mean": 0.00024021057197387563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00044414330659492407, + "epoch": 0.08421052631578947, + "grad_norm": 0.054604385048151016, + "kl": 0.01024627685546875, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0012773060661857016, + "clip_ratio/high_mean": 0.00027123058407596545, + "clip_ratio/low_mean": 0.00036341459599498194, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006346451755234739, + "epoch": 0.08438415626695606, + "grad_norm": 0.053782716393470764, + "kl": 0.010406494140625, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0017270066527999006, + "clip_ratio/high_mean": 0.00041296827475889586, + "clip_ratio/low_mean": 0.00048764439088699874, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009006126601889264, + "epoch": 0.08455778621812263, + "grad_norm": 0.051963213831186295, + "kl": 0.01055145263671875, + "learning_rate": 1e-06, + "loss": 0.0808, + "step": 487 + }, + { + "clip_ratio/high_max": 0.002564054557296913, + "clip_ratio/high_mean": 0.0006153437134344131, + "clip_ratio/low_mean": 0.0006469719837696175, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012623157199413981, + "epoch": 0.0847314161692892, + "grad_norm": 0.04945249855518341, + "kl": 0.01059722900390625, + "learning_rate": 1e-06, + "loss": 0.0805, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2950.0, + "completions/mean_length": 1034.515625, + "completions/mean_terminated_length": 914.096923828125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.08490504612045578, + "grad_norm": 0.060983531177043915, + "kl": 0.01161956787109375, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 31121246.0, + "reward": 0.3058035969734192, + "reward_std": 0.2316855490207672, + "rewards/accuracy_reward/mean": 0.3058035671710968, + "rewards/accuracy_reward/std": 0.4612620174884796, + "step": 489 + }, + { + "clip_ratio/high_max": 0.00017491710605099797, + "clip_ratio/high_mean": 3.592424150156148e-05, + "clip_ratio/low_mean": 4.531008460162411e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.123432508000406e-05, + "epoch": 0.08507867607162235, + "grad_norm": 0.06080056354403496, + "kl": 0.011745452880859375, + "learning_rate": 1e-06, + "loss": 0.0633, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0001592534281371627, + "clip_ratio/high_mean": 3.510348324198276e-05, + "clip_ratio/low_mean": 0.00010286489714417257, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013796838186408422, + "epoch": 0.08525230602278894, + "grad_norm": 0.05835289880633354, + "kl": 0.01165008544921875, + "learning_rate": 1e-06, + "loss": 0.0632, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0002486409448465565, + "clip_ratio/high_mean": 4.8569039563517435e-05, + "clip_ratio/low_mean": 0.00018970187477407308, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023827092763895052, + "epoch": 0.08542593597395551, + "grad_norm": 0.058663032948970795, + "kl": 0.0121612548828125, + "learning_rate": 1e-06, + "loss": 0.063, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0003392478156456491, + "clip_ratio/high_mean": 8.314332376357925e-05, + "clip_ratio/low_mean": 0.0003665440076474624, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004496873161770054, + "epoch": 0.08559956592512208, + "grad_norm": 0.05644400790333748, + "kl": 0.013263702392578125, + "learning_rate": 1e-06, + "loss": 0.0628, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0007035539783828426, + "clip_ratio/high_mean": 0.00015702182781751617, + "clip_ratio/low_mean": 0.000566471459933382, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007234932982100872, + "epoch": 0.08577319587628866, + "grad_norm": 0.05389365926384926, + "kl": 0.01403045654296875, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0007585997118439991, + "clip_ratio/high_mean": 0.00020993632278987207, + "clip_ratio/low_mean": 0.0007817905107003753, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009917268489516573, + "epoch": 0.08594682582745523, + "grad_norm": 0.07498685270547867, + "kl": 0.026973724365234375, + "learning_rate": 1e-06, + "loss": 0.0624, + "step": 495 + }, + { + "clip_ratio/high_max": 0.001043860142090125, + "clip_ratio/high_mean": 0.0002962798189400928, + "clip_ratio/low_mean": 0.0009745389124873327, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012708186950476374, + "epoch": 0.08612045577862182, + "grad_norm": 0.0504300631582737, + "kl": 0.014781951904296875, + "learning_rate": 1e-06, + "loss": 0.0622, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3022.0, + "completions/mean_length": 968.8795166015625, + "completions/mean_terminated_length": 844.58154296875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.08629408572978839, + "grad_norm": 0.07226653397083282, + "kl": 0.011791229248046875, + "learning_rate": 1e-06, + "loss": 0.0565, + "num_tokens": 31617000.0, + "reward": 0.3839285969734192, + "reward_std": 0.2570226788520813, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "step": 497 + }, + { + "clip_ratio/high_max": 0.00011765897033910733, + "clip_ratio/high_mean": 3.458973571923707e-05, + "clip_ratio/low_mean": 4.3183729985685204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.777346536386176e-05, + "epoch": 0.08646771568095496, + "grad_norm": 0.06889041513204575, + "kl": 0.0120697021484375, + "learning_rate": 1e-06, + "loss": 0.0564, + "step": 498 + }, + { + "clip_ratio/high_max": 0.00030082203920756, + "clip_ratio/high_mean": 6.813219169998774e-05, + "clip_ratio/low_mean": 0.00012613713283826655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019426932294663857, + "epoch": 0.08664134563212154, + "grad_norm": 0.06453359872102737, + "kl": 0.012386322021484375, + "learning_rate": 1e-06, + "loss": 0.0563, + "step": 499 + }, + { + "clip_ratio/high_max": 0.00035340800241101533, + "clip_ratio/high_mean": 9.744585804583039e-05, + "clip_ratio/low_mean": 0.00020257014284652541, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030001600407558726, + "epoch": 0.08681497558328811, + "grad_norm": 0.0625876784324646, + "kl": 0.012786865234375, + "learning_rate": 1e-06, + "loss": 0.056, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0004815108623006381, + "clip_ratio/high_mean": 0.00015412010679938248, + "clip_ratio/low_mean": 0.00034608738951646956, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005002074940421153, + "epoch": 0.0869886055344547, + "grad_norm": 0.06027127429842949, + "kl": 0.013092041015625, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0008529822080163285, + "clip_ratio/high_mean": 0.0002634975426190067, + "clip_ratio/low_mean": 0.0004930003251502058, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007564979077869793, + "epoch": 0.08716223548562127, + "grad_norm": 0.05922144651412964, + "kl": 0.01345062255859375, + "learning_rate": 1e-06, + "loss": 0.0555, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0012476019110181369, + "clip_ratio/high_mean": 0.0004059981965838233, + "clip_ratio/low_mean": 0.0007284230978257256, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011344213162374217, + "epoch": 0.08733586543678784, + "grad_norm": 0.05674389377236366, + "kl": 0.013965606689453125, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 503 + }, + { + "clip_ratio/high_max": 0.001676234824117273, + "clip_ratio/high_mean": 0.000592338230489986, + "clip_ratio/low_mean": 0.0009694157179183094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015617539866070729, + "epoch": 0.08750949538795443, + "grad_norm": 0.05616435781121254, + "kl": 0.01425933837890625, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3069.0, + "completions/mean_length": 1008.37060546875, + "completions/mean_terminated_length": 886.4066162109375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.087683125339121, + "grad_norm": 0.11310028284788132, + "kl": 0.014617919921875, + "learning_rate": 1e-06, + "loss": 0.0951, + "num_tokens": 32135566.0, + "reward": 0.3058035969734192, + "reward_std": 0.23815098404884338, + "rewards/accuracy_reward/mean": 0.3058035671710968, + "rewards/accuracy_reward/std": 0.461262047290802, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0002784872194752097, + "clip_ratio/high_mean": 4.799258886123425e-05, + "clip_ratio/low_mean": 6.144659801066155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010943918823613785, + "epoch": 0.08785675529028758, + "grad_norm": 0.1071036234498024, + "kl": 0.0152740478515625, + "learning_rate": 1e-06, + "loss": 0.095, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0003662943599920254, + "clip_ratio/high_mean": 8.819130653137108e-05, + "clip_ratio/low_mean": 0.0003068508940486936, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003950422069465276, + "epoch": 0.08803038524145415, + "grad_norm": 0.08276881277561188, + "kl": 0.01611328125, + "learning_rate": 1e-06, + "loss": 0.0947, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0005431726931419689, + "clip_ratio/high_mean": 0.00012954488533978292, + "clip_ratio/low_mean": 0.000647766933980165, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007773118231853005, + "epoch": 0.08820401519262072, + "grad_norm": 0.07499915361404419, + "kl": 0.017303466796875, + "learning_rate": 1e-06, + "loss": 0.0945, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0012288490070204716, + "clip_ratio/high_mean": 0.0002627983894853969, + "clip_ratio/low_mean": 0.0010058399529953022, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012686383342952468, + "epoch": 0.0883776451437873, + "grad_norm": 0.07293414324522018, + "kl": 0.0185699462890625, + "learning_rate": 1e-06, + "loss": 0.0941, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0014327327844512183, + "clip_ratio/high_mean": 0.00030834701101412065, + "clip_ratio/low_mean": 0.001280644537473563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015889915302977897, + "epoch": 0.08855127509495388, + "grad_norm": 0.06990481168031693, + "kl": 0.0198516845703125, + "learning_rate": 1e-06, + "loss": 0.0939, + "step": 510 + }, + { + "clip_ratio/high_max": 0.00203613664052682, + "clip_ratio/high_mean": 0.0004814665371668525, + "clip_ratio/low_mean": 0.0015811335215403233, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002062600113276858, + "epoch": 0.08872490504612046, + "grad_norm": 0.06437516212463379, + "kl": 0.0209197998046875, + "learning_rate": 1e-06, + "loss": 0.0935, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0027423587889643386, + "clip_ratio/high_mean": 0.0006705494379275478, + "clip_ratio/low_mean": 0.0018219305638922378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002492479994543828, + "epoch": 0.08889853499728703, + "grad_norm": 0.06246088072657585, + "kl": 0.021820068359375, + "learning_rate": 1e-06, + "loss": 0.0933, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0736607142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3023.0, + "completions/mean_length": 1036.712158203125, + "completions/mean_terminated_length": 874.869873046875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0890721649484536, + "grad_norm": 0.06348582357168198, + "kl": 0.01929473876953125, + "learning_rate": 1e-06, + "loss": 0.0394, + "num_tokens": 32666645.0, + "reward": 0.296875, + "reward_std": 0.23372390866279602, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45739173889160156, + "step": 513 + }, + { + "clip_ratio/high_max": 0.00018095238738169428, + "clip_ratio/high_mean": 3.002503126481315e-05, + "clip_ratio/low_mean": 4.201684112103976e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.204187215847924e-05, + "epoch": 0.08924579489962019, + "grad_norm": 0.061720896512269974, + "kl": 0.019317626953125, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 514 + }, + { + "clip_ratio/high_max": 0.00021659562298737, + "clip_ratio/high_mean": 5.068701284471899e-05, + "clip_ratio/low_mean": 5.738877064231929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00010807578019011999, + "epoch": 0.08941942485078676, + "grad_norm": 0.06098145619034767, + "kl": 0.0195770263671875, + "learning_rate": 1e-06, + "loss": 0.0392, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0004323845032558893, + "clip_ratio/high_mean": 0.00011135401746287243, + "clip_ratio/low_mean": 0.00017057859724900482, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028193260413900134, + "epoch": 0.08959305480195334, + "grad_norm": 0.057154376059770584, + "kl": 0.01970672607421875, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0007908407987997634, + "clip_ratio/high_mean": 0.00020713153071483248, + "clip_ratio/low_mean": 0.00027662156594487897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004837530977965798, + "epoch": 0.08976668475311991, + "grad_norm": 0.05614238232374191, + "kl": 0.0203704833984375, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0009348532548756339, + "clip_ratio/high_mean": 0.0002894922427003621, + "clip_ratio/low_mean": 0.0005204976619097579, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008099899096123409, + "epoch": 0.08994031470428648, + "grad_norm": 0.05459915101528168, + "kl": 0.02060699462890625, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0015838956824154593, + "clip_ratio/high_mean": 0.0004430647713888902, + "clip_ratio/low_mean": 0.0007357650310950703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011788298234023387, + "epoch": 0.09011394465545307, + "grad_norm": 0.05324577912688255, + "kl": 0.0209503173828125, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 519 + }, + { + "clip_ratio/high_max": 0.002225075411843136, + "clip_ratio/high_mean": 0.0006202897566254251, + "clip_ratio/low_mean": 0.0010112727341038408, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016315624852722976, + "epoch": 0.09028757460661964, + "grad_norm": 0.04997394606471062, + "kl": 0.02117156982421875, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3050.0, + "completions/mean_length": 1174.450927734375, + "completions/mean_terminated_length": 967.787109375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.09046120455778622, + "grad_norm": 0.058505360037088394, + "kl": 0.014251708984375, + "learning_rate": 1e-06, + "loss": 0.0899, + "num_tokens": 33263087.0, + "reward": 0.2901785969734192, + "reward_std": 0.1920231282711029, + "rewards/accuracy_reward/mean": 0.2901785671710968, + "rewards/accuracy_reward/std": 0.4543520212173462, + "step": 521 + }, + { + "clip_ratio/high_max": 0.00018939151323138503, + "clip_ratio/high_mean": 2.7055931127506483e-05, + "clip_ratio/low_mean": 1.7372587876707257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4428519686334766e-05, + "epoch": 0.0906348345089528, + "grad_norm": 0.058450400829315186, + "kl": 0.0144805908203125, + "learning_rate": 1e-06, + "loss": 0.0898, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0001865392678155331, + "clip_ratio/high_mean": 3.178898418809695e-05, + "clip_ratio/low_mean": 5.343288239600952e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.522186658410646e-05, + "epoch": 0.09080846446011936, + "grad_norm": 0.0574953518807888, + "kl": 0.0145721435546875, + "learning_rate": 1e-06, + "loss": 0.0897, + "step": 523 + }, + { + "clip_ratio/high_max": 0.00021607689268421382, + "clip_ratio/high_mean": 5.018674153234315e-05, + "clip_ratio/low_mean": 0.00010101058035161259, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001511973232481978, + "epoch": 0.09098209441128595, + "grad_norm": 0.05667233094573021, + "kl": 0.01490020751953125, + "learning_rate": 1e-06, + "loss": 0.0896, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0002604551355034346, + "clip_ratio/high_mean": 6.826247908975347e-05, + "clip_ratio/low_mean": 0.00022604179252994072, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002943042750302993, + "epoch": 0.09115572436245252, + "grad_norm": 0.054369840770959854, + "kl": 0.0150604248046875, + "learning_rate": 1e-06, + "loss": 0.0894, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0005568139640672598, + "clip_ratio/high_mean": 0.0001695622927400109, + "clip_ratio/low_mean": 0.0003658834987163573, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005354458107831306, + "epoch": 0.0913293543136191, + "grad_norm": 0.052510615438222885, + "kl": 0.01525115966796875, + "learning_rate": 1e-06, + "loss": 0.0892, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0007949760693009011, + "clip_ratio/high_mean": 0.00024754407513682963, + "clip_ratio/low_mean": 0.0005546507495637343, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000802194826064806, + "epoch": 0.09150298426478568, + "grad_norm": 0.049648914486169815, + "kl": 0.015594482421875, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0011350465720170178, + "clip_ratio/high_mean": 0.0003483728701212385, + "clip_ratio/low_mean": 0.0007451051237694628, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001093477992071712, + "epoch": 0.09167661421595225, + "grad_norm": 0.0474773645401001, + "kl": 0.015625, + "learning_rate": 1e-06, + "loss": 0.0888, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0870535714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2996.0, + "completions/mean_length": 1004.3906860351562, + "completions/mean_terminated_length": 807.2347412109375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.09185024416711883, + "grad_norm": 0.12496041506528854, + "kl": 0.023162841796875, + "learning_rate": 1e-06, + "loss": 0.1205, + "num_tokens": 33776550.0, + "reward": 0.4040178656578064, + "reward_std": 0.23484933376312256, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0002989161275763763, + "clip_ratio/high_mean": 6.70192625875643e-05, + "clip_ratio/low_mean": 2.6756818783724157e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.37760842134594e-05, + "epoch": 0.0920238741182854, + "grad_norm": 0.12704452872276306, + "kl": 0.02356719970703125, + "learning_rate": 1e-06, + "loss": 0.1203, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0001826482885007863, + "clip_ratio/high_mean": 3.262148391058872e-05, + "clip_ratio/low_mean": 0.00014739640732841508, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018001789203481167, + "epoch": 0.09219750406945199, + "grad_norm": 0.12202641367912292, + "kl": 0.02437591552734375, + "learning_rate": 1e-06, + "loss": 0.12, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0004636226949514821, + "clip_ratio/high_mean": 0.00011774114500440191, + "clip_ratio/low_mean": 0.0005971784921712242, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007149196335376473, + "epoch": 0.09237113402061856, + "grad_norm": 0.11640335619449615, + "kl": 0.02568817138671875, + "learning_rate": 1e-06, + "loss": 0.1194, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0004757142251037294, + "clip_ratio/high_mean": 0.00013730248838328407, + "clip_ratio/low_mean": 0.0011970669456786709, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013343694226932712, + "epoch": 0.09254476397178513, + "grad_norm": 0.12437038123607635, + "kl": 0.02751922607421875, + "learning_rate": 1e-06, + "loss": 0.1188, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0009233936325472314, + "clip_ratio/high_mean": 0.0002580384625616716, + "clip_ratio/low_mean": 0.0019656805852719117, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022237191151361912, + "epoch": 0.09271839392295171, + "grad_norm": 0.15404506027698517, + "kl": 0.02921295166015625, + "learning_rate": 1e-06, + "loss": 0.118, + "step": 534 + }, + { + "clip_ratio/high_max": 0.001292944867600454, + "clip_ratio/high_mean": 0.00037816305757587543, + "clip_ratio/low_mean": 0.0030763038412260357, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00345446676510619, + "epoch": 0.09289202387411828, + "grad_norm": 0.2130662053823471, + "kl": 0.0317840576171875, + "learning_rate": 1e-06, + "loss": 0.1169, + "step": 535 + }, + { + "clip_ratio/high_max": 0.002173201664845692, + "clip_ratio/high_mean": 0.0005982196335025947, + "clip_ratio/low_mean": 0.005105946947878692, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005704166505893227, + "epoch": 0.09306565382528487, + "grad_norm": 0.33149293065071106, + "kl": 0.0341033935546875, + "learning_rate": 1e-06, + "loss": 0.1151, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0892857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3028.0, + "completions/mean_length": 1272.2523193359375, + "completions/mean_terminated_length": 1095.806396484375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.09323928377645144, + "grad_norm": 0.07863554358482361, + "kl": 0.02123260498046875, + "learning_rate": 1e-06, + "loss": 0.0762, + "num_tokens": 34415759.0, + "reward": 0.3169642984867096, + "reward_std": 0.23296070098876953, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0003040993651666213, + "clip_ratio/high_mean": 6.461458724515978e-05, + "clip_ratio/low_mean": 5.5714149880259356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012032873655698495, + "epoch": 0.09341291372761801, + "grad_norm": 0.07925178855657578, + "kl": 0.0226898193359375, + "learning_rate": 1e-06, + "loss": 0.076, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0005464224886964075, + "clip_ratio/high_mean": 0.00012489397295212257, + "clip_ratio/low_mean": 0.00037720665568485856, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005021006199967815, + "epoch": 0.09358654367878459, + "grad_norm": 0.07895436882972717, + "kl": 0.02400970458984375, + "learning_rate": 1e-06, + "loss": 0.0758, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0006468031733675161, + "clip_ratio/high_mean": 0.00017037380530382507, + "clip_ratio/low_mean": 0.0009339469543192536, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011043207450711634, + "epoch": 0.09376017362995116, + "grad_norm": 0.0744888037443161, + "kl": 0.025726318359375, + "learning_rate": 1e-06, + "loss": 0.0756, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0010563496252871118, + "clip_ratio/high_mean": 0.00026857949615077814, + "clip_ratio/low_mean": 0.0014048888115212321, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016734683413233142, + "epoch": 0.09393380358111775, + "grad_norm": 0.0809512734413147, + "kl": 0.027069091796875, + "learning_rate": 1e-06, + "loss": 0.0753, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0013745773539994843, + "clip_ratio/high_mean": 0.0003676870528579457, + "clip_ratio/low_mean": 0.0020067759469384328, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002374463052547071, + "epoch": 0.09410743353228432, + "grad_norm": 0.08664121478796005, + "kl": 0.028564453125, + "learning_rate": 1e-06, + "loss": 0.0749, + "step": 542 + }, + { + "clip_ratio/high_max": 0.001857527022366412, + "clip_ratio/high_mean": 0.0005080788523628144, + "clip_ratio/low_mean": 0.0028379877330735326, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0033460664926678874, + "epoch": 0.09428106348345089, + "grad_norm": 0.09937120229005814, + "kl": 0.0304107666015625, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0022967568365857005, + "clip_ratio/high_mean": 0.000624309401246137, + "clip_ratio/low_mean": 0.0040163086232496426, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004640617858967744, + "epoch": 0.09445469343461747, + "grad_norm": 0.12508922815322876, + "kl": 0.0320892333984375, + "learning_rate": 1e-06, + "loss": 0.074, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0959821428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3048.0, + "completions/mean_length": 1089.7054443359375, + "completions/mean_terminated_length": 879.2395629882812, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.09462832338578404, + "grad_norm": 0.2625873386859894, + "kl": 0.0465087890625, + "learning_rate": 1e-06, + "loss": 0.0945, + "num_tokens": 34967323.0, + "reward": 0.3169642984867096, + "reward_std": 0.24897116422653198, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0005054846087659826, + "clip_ratio/high_mean": 9.584138774698658e-05, + "clip_ratio/low_mean": 7.650214843124559e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017234353845196893, + "epoch": 0.09480195333695063, + "grad_norm": 0.2898998260498047, + "kl": 0.0494842529296875, + "learning_rate": 1e-06, + "loss": 0.0936, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0006608037247133325, + "clip_ratio/high_mean": 0.00013870749535271898, + "clip_ratio/low_mean": 0.0018206840090897458, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019593914962570125, + "epoch": 0.0949755832881172, + "grad_norm": 0.2907351553440094, + "kl": 0.05401611328125, + "learning_rate": 1e-06, + "loss": 0.0921, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0008963173495430965, + "clip_ratio/high_mean": 0.00021387828383012675, + "clip_ratio/low_mean": 0.009547454135827138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009761332446942106, + "epoch": 0.09514921323928377, + "grad_norm": 0.26232901215553284, + "kl": 0.061920166015625, + "learning_rate": 1e-06, + "loss": 0.0903, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0012662637454923242, + "clip_ratio/high_mean": 0.00034828326761271455, + "clip_ratio/low_mean": 0.021875227645068662, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02222351179079851, + "epoch": 0.09532284319045035, + "grad_norm": 0.2798546850681305, + "kl": 0.07403564453125, + "learning_rate": 1e-06, + "loss": 0.0884, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0017091440276999492, + "clip_ratio/high_mean": 0.00044742689169652294, + "clip_ratio/low_mean": 0.0384919215211994, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.038939345882681664, + "epoch": 0.09549647314161692, + "grad_norm": 0.19144611060619354, + "kl": 0.093841552734375, + "learning_rate": 1e-06, + "loss": 0.0866, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0024227378416981082, + "clip_ratio/high_mean": 0.0006704614079353632, + "clip_ratio/low_mean": 0.04955312468155171, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05022358605856425, + "epoch": 0.09567010309278351, + "grad_norm": 0.17801299691200256, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": 0.0853, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0033653479113127105, + "clip_ratio/high_mean": 0.0009353125187772093, + "clip_ratio/low_mean": 0.05843951608039788, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0593748302962922, + "epoch": 0.09584373304395008, + "grad_norm": 0.18499353528022766, + "kl": 0.17572021484375, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3030.0, + "completions/mean_length": 937.6652221679688, + "completions/mean_terminated_length": 878.9219970703125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.09601736299511665, + "grad_norm": 0.37915924191474915, + "kl": 0.07330322265625, + "learning_rate": 1e-06, + "loss": 0.1104, + "num_tokens": 35450373.0, + "reward": 0.3348214328289032, + "reward_std": 0.2289067953824997, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "step": 553 + }, + { + "clip_ratio/high_max": 0.00021775357436126797, + "clip_ratio/high_mean": 4.958740487381874e-05, + "clip_ratio/low_mean": 0.0025995454063831858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026491328630982025, + "epoch": 0.09619099294628324, + "grad_norm": 0.24498675763607025, + "kl": 0.08251953125, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0003937987748940941, + "clip_ratio/high_mean": 8.885893976184889e-05, + "clip_ratio/low_mean": 0.006574953778908821, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006663812797341961, + "epoch": 0.0963646228974498, + "grad_norm": 0.17716802656650543, + "kl": 0.09820556640625, + "learning_rate": 1e-06, + "loss": 0.1083, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0007635476667928742, + "clip_ratio/high_mean": 0.00018920662751042983, + "clip_ratio/low_mean": 0.009844635125773493, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010033842037955765, + "epoch": 0.09653825284861639, + "grad_norm": 0.11271374672651291, + "kl": 0.119720458984375, + "learning_rate": 1e-06, + "loss": 0.1076, + "step": 556 + }, + { + "clip_ratio/high_max": 0.001080135683878325, + "clip_ratio/high_mean": 0.00032758905490481993, + "clip_ratio/low_mean": 0.012776187373674475, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013103777324431576, + "epoch": 0.09671188279978296, + "grad_norm": 0.08881672471761703, + "kl": 0.147918701171875, + "learning_rate": 1e-06, + "loss": 0.1071, + "step": 557 + }, + { + "clip_ratio/high_max": 0.00147506167559186, + "clip_ratio/high_mean": 0.0004436824492586311, + "clip_ratio/low_mean": 0.015480168964131735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015923851518891752, + "epoch": 0.09688551275094955, + "grad_norm": 0.08065946400165558, + "kl": 0.18212890625, + "learning_rate": 1e-06, + "loss": 0.1067, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0020431840675882995, + "clip_ratio/high_mean": 0.0006639306302531622, + "clip_ratio/low_mean": 0.017476870460086502, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018140800850233063, + "epoch": 0.09705914270211612, + "grad_norm": 0.08254239708185196, + "kl": 0.221923828125, + "learning_rate": 1e-06, + "loss": 0.1063, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0026953942433465272, + "clip_ratio/high_mean": 0.0008732588721613865, + "clip_ratio/low_mean": 0.018732943310169503, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019606202462455258, + "epoch": 0.09723277265328269, + "grad_norm": 0.0837223157286644, + "kl": 0.25567626953125, + "learning_rate": 1e-06, + "loss": 0.106, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2234.0, + "completions/mean_length": 832.294677734375, + "completions/mean_terminated_length": 807.0158081054688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.09740640260444927, + "grad_norm": 0.572095513343811, + "kl": 0.2568359375, + "learning_rate": 1e-06, + "loss": 0.0278, + "num_tokens": 35892385.0, + "reward": 0.3571428656578064, + "reward_std": 0.2751179337501526, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0005813209245388862, + "clip_ratio/high_mean": 0.00011569647813303163, + "clip_ratio/low_mean": 0.0035241650257376023, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003639861461124383, + "epoch": 0.09758003255561584, + "grad_norm": 0.31742629408836365, + "kl": 0.3162841796875, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0008306621566589456, + "clip_ratio/high_mean": 0.00019597981372498907, + "clip_ratio/low_mean": 0.011486168834380805, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011682148688123561, + "epoch": 0.09775366250678243, + "grad_norm": 0.23384420573711395, + "kl": 0.379638671875, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0019910637056455016, + "clip_ratio/high_mean": 0.00047866486056591384, + "clip_ratio/low_mean": 0.015350540503277443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01582920560031198, + "epoch": 0.097927292457949, + "grad_norm": 0.20674023032188416, + "kl": 0.43505859375, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0021971423411741853, + "clip_ratio/high_mean": 0.0006057178215996828, + "clip_ratio/low_mean": 0.01746332665788941, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018069043988361955, + "epoch": 0.09810092240911557, + "grad_norm": 0.16309677064418793, + "kl": 0.4775390625, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 565 + }, + { + "clip_ratio/high_max": 0.003826558036962524, + "clip_ratio/high_mean": 0.001091672202164773, + "clip_ratio/low_mean": 0.018750762625131756, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019842436158796772, + "epoch": 0.09827455236028215, + "grad_norm": 0.13427001237869263, + "kl": 0.50244140625, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0060576970427064225, + "clip_ratio/high_mean": 0.0016957492043729872, + "clip_ratio/low_mean": 0.01974070313735865, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02143645117757842, + "epoch": 0.09844818231144872, + "grad_norm": 0.1237688735127449, + "kl": 0.516845703125, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 567 + }, + { + "clip_ratio/high_max": 0.013446684810332954, + "clip_ratio/high_mean": 0.0030946295883040875, + "clip_ratio/low_mean": 0.020272769761504605, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023367399640846997, + "epoch": 0.09862181226261531, + "grad_norm": 0.13202856481075287, + "kl": 0.523193359375, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2842.0, + "completions/mean_length": 1016.3638916015625, + "completions/mean_terminated_length": 954.9310302734375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.09879544221378188, + "grad_norm": 0.10529527068138123, + "kl": 0.1856689453125, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 36416452.0, + "reward": 0.283482164144516, + "reward_std": 0.22582855820655823, + "rewards/accuracy_reward/mean": 0.2834821343421936, + "rewards/accuracy_reward/std": 0.4511922299861908, + "step": 569 + }, + { + "clip_ratio/high_max": 0.00021573750200332142, + "clip_ratio/high_mean": 4.336871347732085e-05, + "clip_ratio/low_mean": 3.509745397423103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.846616563256248e-05, + "epoch": 0.09896907216494845, + "grad_norm": 0.10366082191467285, + "kl": 0.18280029296875, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 570 + }, + { + "clip_ratio/high_max": 0.005275058503684704, + "clip_ratio/high_mean": 0.000778637557800721, + "clip_ratio/low_mean": 9.193240498461819e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008705699306119641, + "epoch": 0.09914270211611503, + "grad_norm": 0.09738457202911377, + "kl": 0.180908203125, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 571 + }, + { + "clip_ratio/high_max": 0.010816334928676952, + "clip_ratio/high_mean": 0.0016024466822273098, + "clip_ratio/low_mean": 0.0002964428967970889, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018988894917129073, + "epoch": 0.0993163320672816, + "grad_norm": 0.09940585494041443, + "kl": 0.18035888671875, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 572 + }, + { + "clip_ratio/high_max": 0.014238215080695227, + "clip_ratio/high_mean": 0.002106691350491019, + "clip_ratio/low_mean": 0.0009225303583662026, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0030292216106317937, + "epoch": 0.09948996201844819, + "grad_norm": 0.09260519593954086, + "kl": 0.1849365234375, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 573 + }, + { + "clip_ratio/high_max": 0.017239402804989368, + "clip_ratio/high_mean": 0.002615036452880304, + "clip_ratio/low_mean": 0.0017235304258065298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004338567072409205, + "epoch": 0.09966359196961476, + "grad_norm": 0.08561917394399643, + "kl": 0.1937255859375, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 574 + }, + { + "clip_ratio/high_max": 0.01999792282731505, + "clip_ratio/high_mean": 0.0030689344530401286, + "clip_ratio/low_mean": 0.0028736825770465657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005942616844549775, + "epoch": 0.09983722192078133, + "grad_norm": 0.0773790031671524, + "kl": 0.202880859375, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 575 + }, + { + "clip_ratio/high_max": 0.022380143804184627, + "clip_ratio/high_mean": 0.0035066731325059664, + "clip_ratio/low_mean": 0.003924697346519679, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007431370613630861, + "epoch": 0.10001085187194791, + "grad_norm": 0.0736330971121788, + "kl": 0.2149658203125, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2476.0, + "completions/mean_length": 810.1585083007812, + "completions/mean_terminated_length": 789.7815551757812, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.10018448182311449, + "grad_norm": 1.2772879600524902, + "kl": 0.37890625, + "learning_rate": 1e-06, + "loss": 0.0528, + "num_tokens": 36843867.0, + "reward": 0.3437500298023224, + "reward_std": 0.2888752818107605, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0002612620401123422, + "clip_ratio/high_mean": 7.155830917326966e-05, + "clip_ratio/low_mean": 0.009197029401548207, + "clip_ratio/low_min": 7.649938925169408e-05, + "clip_ratio/region_mean": 0.009268587629776448, + "epoch": 0.10035811177428107, + "grad_norm": 0.5471004843711853, + "kl": 0.477783203125, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0009207809816871304, + "clip_ratio/high_mean": 0.0002628334659675602, + "clip_ratio/low_mean": 0.024099400965496898, + "clip_ratio/low_min": 0.00036443150020204484, + "clip_ratio/region_mean": 0.02436223457334563, + "epoch": 0.10053174172544764, + "grad_norm": 0.2657545804977417, + "kl": 0.6396484375, + "learning_rate": 1e-06, + "loss": 0.0464, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0015168095342232846, + "clip_ratio/high_mean": 0.00045315780698729213, + "clip_ratio/low_mean": 0.03135586279677227, + "clip_ratio/low_min": 0.00048199002048932016, + "clip_ratio/region_mean": 0.031809021544177085, + "epoch": 0.10070537167661421, + "grad_norm": 0.2656930088996887, + "kl": 0.822509765625, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 580 + }, + { + "clip_ratio/high_max": 0.001962836249731481, + "clip_ratio/high_mean": 0.0006771465923520736, + "clip_ratio/low_mean": 0.034930469118990004, + "clip_ratio/low_min": 0.0005407693097367883, + "clip_ratio/region_mean": 0.03560761600965634, + "epoch": 0.1008790016277808, + "grad_norm": 0.2681480050086975, + "kl": 0.9931640625, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 581 + }, + { + "clip_ratio/high_max": 0.002674753646715544, + "clip_ratio/high_mean": 0.000948868411796866, + "clip_ratio/low_mean": 0.03691972343949601, + "clip_ratio/low_min": 0.0005172576056793332, + "clip_ratio/region_mean": 0.03786859137471765, + "epoch": 0.10105263157894737, + "grad_norm": 0.24651777744293213, + "kl": 1.12451171875, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0031685368448961526, + "clip_ratio/high_mean": 0.001219699544890318, + "clip_ratio/low_mean": 0.038191506115254015, + "clip_ratio/low_min": 0.00039969905628822744, + "clip_ratio/region_mean": 0.039411205041687936, + "epoch": 0.10122626153011395, + "grad_norm": 0.20935000479221344, + "kl": 1.2109375, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 583 + }, + { + "clip_ratio/high_max": 0.004168231360381469, + "clip_ratio/high_mean": 0.0015859595368965529, + "clip_ratio/low_mean": 0.038649689697194844, + "clip_ratio/low_min": 0.0003761873522307724, + "clip_ratio/region_mean": 0.04023564967792481, + "epoch": 0.10139989148128052, + "grad_norm": 0.14459004998207092, + "kl": 1.24267578125, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2987.0, + "completions/mean_length": 798.8638916015625, + "completions/mean_terminated_length": 773.2077026367188, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.10157352143244709, + "grad_norm": 0.4311932325363159, + "kl": 0.543212890625, + "learning_rate": 1e-06, + "loss": 0.0305, + "num_tokens": 37263094.0, + "reward": 0.3348214328289032, + "reward_std": 0.2461118996143341, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "step": 585 + }, + { + "clip_ratio/high_max": 0.00029831845495209564, + "clip_ratio/high_mean": 8.135705252243497e-05, + "clip_ratio/low_mean": 0.0001534583279863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023481537209590897, + "epoch": 0.10174715138361368, + "grad_norm": 0.40137624740600586, + "kl": 0.565673828125, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0007362750038737431, + "clip_ratio/high_mean": 0.0002670908752406831, + "clip_ratio/low_mean": 0.003612981498008594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003880072443280369, + "epoch": 0.10192078133478025, + "grad_norm": 0.24040406942367554, + "kl": 0.6240234375, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0013989802537253127, + "clip_ratio/high_mean": 0.0005115943276905455, + "clip_ratio/low_mean": 0.011723557021468878, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012235152011271566, + "epoch": 0.10209441128594683, + "grad_norm": 0.15667647123336792, + "kl": 0.702392578125, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0019105732353636995, + "clip_ratio/high_mean": 0.0007567326683783904, + "clip_ratio/low_mean": 0.017795170191675425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018551903252955526, + "epoch": 0.1022680412371134, + "grad_norm": 0.16088135540485382, + "kl": 0.789794921875, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0027712986629921943, + "clip_ratio/high_mean": 0.0010265068340231664, + "clip_ratio/low_mean": 0.021646621404215693, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02267312933690846, + "epoch": 0.10244167118827997, + "grad_norm": 0.15986508131027222, + "kl": 0.87060546875, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 590 + }, + { + "clip_ratio/high_max": 0.003379987261723727, + "clip_ratio/high_mean": 0.0012569222890306264, + "clip_ratio/low_mean": 0.0239271231694147, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02518404519651085, + "epoch": 0.10261530113944656, + "grad_norm": 0.15419812500476837, + "kl": 0.9345703125, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 591 + }, + { + "clip_ratio/high_max": 0.004022586712380871, + "clip_ratio/high_mean": 0.0014482497936114669, + "clip_ratio/low_mean": 0.02565131289884448, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027099563041701913, + "epoch": 0.10278893109061313, + "grad_norm": 0.14795953035354614, + "kl": 0.999755859375, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2825.0, + "completions/mean_length": 743.4420166015625, + "completions/mean_terminated_length": 717.1602783203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.10296256104177971, + "grad_norm": 0.3971327543258667, + "kl": 0.9775390625, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 37656548.0, + "reward": 0.3504464328289032, + "reward_std": 0.2588193118572235, + "rewards/accuracy_reward/mean": 0.3504464328289032, + "rewards/accuracy_reward/std": 0.47764313220977783, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0002824744242388988, + "clip_ratio/high_mean": 6.015211124577036e-05, + "clip_ratio/low_mean": 0.0001477909772802377, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020794309602933936, + "epoch": 0.10313619099294628, + "grad_norm": 0.3704015910625458, + "kl": 1.05517578125, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 594 + }, + { + "clip_ratio/high_max": 0.00035245155049778987, + "clip_ratio/high_mean": 9.035296056936204e-05, + "clip_ratio/low_mean": 0.0031132111471379176, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003203564199793618, + "epoch": 0.10330982094411285, + "grad_norm": 0.271714985370636, + "kl": 1.1708984375, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0008464343045488931, + "clip_ratio/high_mean": 0.00027169288478035014, + "clip_ratio/low_mean": 0.009938626695657149, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010210319713223726, + "epoch": 0.10348345089527944, + "grad_norm": 0.2593633830547333, + "kl": 1.30419921875, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 596 + }, + { + "clip_ratio/high_max": 0.001434485449863132, + "clip_ratio/high_mean": 0.00047574098516633967, + "clip_ratio/low_mean": 0.0157760790316388, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016251820314209908, + "epoch": 0.10365708084644601, + "grad_norm": 0.250699907541275, + "kl": 1.40087890625, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 597 + }, + { + "clip_ratio/high_max": 0.002085937918309355, + "clip_ratio/high_mean": 0.0007588684320580796, + "clip_ratio/low_mean": 0.019806593423709273, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020565461600199342, + "epoch": 0.1038307107976126, + "grad_norm": 0.22722899913787842, + "kl": 1.4462890625, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0025347844275529496, + "clip_ratio/high_mean": 0.0009499708048679167, + "clip_ratio/low_mean": 0.022192087664734572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02314205828588456, + "epoch": 0.10400434074877916, + "grad_norm": 0.17428654432296753, + "kl": 1.44970703125, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 599 + }, + { + "clip_ratio/high_max": 0.003452606499195099, + "clip_ratio/high_mean": 0.001370205089187948, + "clip_ratio/low_mean": 0.02355581399751827, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02492601959966123, + "epoch": 0.10417797069994574, + "grad_norm": 0.12400983273983002, + "kl": 1.42333984375, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2242.0, + "completions/mean_length": 653.1674194335938, + "completions/mean_terminated_length": 647.7561645507812, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.10435160065111232, + "grad_norm": 0.49277326464653015, + "kl": 2.0830078125, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 38011239.0, + "reward": 0.25, + "reward_std": 0.21530278027057648, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43349677324295044, + "step": 601 + }, + { + "clip_ratio/high_max": 0.00044139448800706305, + "clip_ratio/high_mean": 8.746096500544809e-05, + "clip_ratio/low_mean": 0.00018578718845674302, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002732481561906752, + "epoch": 0.10452523060227889, + "grad_norm": 0.45146024227142334, + "kl": 2.1337890625, + "learning_rate": 1e-06, + "loss": -0.0169, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0006997763084655162, + "clip_ratio/high_mean": 0.0001774450051925669, + "clip_ratio/low_mean": 0.003663782321382314, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003841227269731462, + "epoch": 0.10469886055344547, + "grad_norm": 0.2768205404281616, + "kl": 2.28466796875, + "learning_rate": 1e-06, + "loss": -0.0186, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0015612240349582862, + "clip_ratio/high_mean": 0.00048171962043852545, + "clip_ratio/low_mean": 0.012631829420570284, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013113548746332526, + "epoch": 0.10487249050461205, + "grad_norm": 0.21085558831691742, + "kl": 2.4921875, + "learning_rate": 1e-06, + "loss": -0.0197, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0021652372051903512, + "clip_ratio/high_mean": 0.0007053582103253575, + "clip_ratio/low_mean": 0.02146156900562346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022166927345097065, + "epoch": 0.10504612045577862, + "grad_norm": 0.24813611805438995, + "kl": 2.677734375, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0028776760227628984, + "clip_ratio/high_mean": 0.0009236359001079109, + "clip_ratio/low_mean": 0.0282519340980798, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029175569768995047, + "epoch": 0.1052197504069452, + "grad_norm": 0.24759487807750702, + "kl": 2.8046875, + "learning_rate": 1e-06, + "loss": -0.021, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0035119160020258278, + "clip_ratio/high_mean": 0.0011971444055234315, + "clip_ratio/low_mean": 0.03213659883476794, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03333374275825918, + "epoch": 0.10539338035811177, + "grad_norm": 0.20858578383922577, + "kl": 2.859375, + "learning_rate": 1e-06, + "loss": -0.0216, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0044502966266009025, + "clip_ratio/high_mean": 0.0014534032343362924, + "clip_ratio/low_mean": 0.034378798212856054, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.035832200781442225, + "epoch": 0.10556701030927836, + "grad_norm": 0.165052130818367, + "kl": 2.8662109375, + "learning_rate": 1e-06, + "loss": -0.0221, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004464285714285698, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3057.0, + "completions/mean_length": 732.3839721679688, + "completions/mean_terminated_length": 721.8923950195312, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.10574064026044493, + "grad_norm": 0.5255489945411682, + "kl": 2.671875, + "learning_rate": 1e-06, + "loss": -0.0453, + "num_tokens": 38402331.0, + "reward": 0.2075892984867096, + "reward_std": 0.2329593002796173, + "rewards/accuracy_reward/mean": 0.2075892835855484, + "rewards/accuracy_reward/std": 0.4060344398021698, + "step": 609 + }, + { + "clip_ratio/high_max": 4.366049688542262e-05, + "clip_ratio/high_mean": 6.237213710846845e-06, + "clip_ratio/low_mean": 7.183123716458795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.80684517849295e-05, + "epoch": 0.1059142702116115, + "grad_norm": 0.48377078771591187, + "kl": 2.8681640625, + "learning_rate": 1e-06, + "loss": -0.0467, + "step": 610 + }, + { + "clip_ratio/high_max": 0.00019057484132645186, + "clip_ratio/high_mean": 3.876686560033704e-05, + "clip_ratio/low_mean": 0.005993725324515253, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0060324923251755536, + "epoch": 0.10608790016277808, + "grad_norm": 0.266008198261261, + "kl": 3.26953125, + "learning_rate": 1e-06, + "loss": -0.0484, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0005481868429342285, + "clip_ratio/high_mean": 0.00011200043695680506, + "clip_ratio/low_mean": 0.019367937929928303, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019479937269352376, + "epoch": 0.10626153011394465, + "grad_norm": 0.34379372000694275, + "kl": 3.728515625, + "learning_rate": 1e-06, + "loss": -0.0493, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0008298284701595549, + "clip_ratio/high_mean": 0.00019642440020106733, + "clip_ratio/low_mean": 0.027246260782703757, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027442683815024793, + "epoch": 0.10643516006511124, + "grad_norm": 0.37733131647109985, + "kl": 3.9921875, + "learning_rate": 1e-06, + "loss": -0.0499, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0015552872555417707, + "clip_ratio/high_mean": 0.0003578667697183846, + "clip_ratio/low_mean": 0.03146353631746024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031821402721107006, + "epoch": 0.10660879001627781, + "grad_norm": 0.3163529336452484, + "kl": 3.9765625, + "learning_rate": 1e-06, + "loss": -0.0507, + "step": 614 + }, + { + "clip_ratio/high_max": 0.003250576846767217, + "clip_ratio/high_mean": 0.0008295837760670111, + "clip_ratio/low_mean": 0.032610182184726, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03343976614996791, + "epoch": 0.10678241996744438, + "grad_norm": 0.18202972412109375, + "kl": 3.796875, + "learning_rate": 1e-06, + "loss": -0.0515, + "step": 615 + }, + { + "clip_ratio/high_max": 0.006122976919868961, + "clip_ratio/high_mean": 0.0016006997757358477, + "clip_ratio/low_mean": 0.03302354761399329, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.034624248277395964, + "epoch": 0.10695604991861096, + "grad_norm": 0.21380575001239777, + "kl": 3.5703125, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.006696428571428603, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2886.0, + "completions/mean_length": 673.9955444335938, + "completions/mean_terminated_length": 657.8292236328125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.10712967986977753, + "grad_norm": 0.9599938988685608, + "kl": 3.1591796875, + "learning_rate": 1e-06, + "loss": -0.0321, + "num_tokens": 38770433.0, + "reward": 0.1986607164144516, + "reward_std": 0.22289502620697021, + "rewards/accuracy_reward/mean": 0.1986607164144516, + "rewards/accuracy_reward/std": 0.3994380533695221, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0002519217159715481, + "clip_ratio/high_mean": 5.144684337210492e-05, + "clip_ratio/low_mean": 0.001462544321839232, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015139911920414306, + "epoch": 0.10730330982094412, + "grad_norm": 0.6943337321281433, + "kl": 3.50390625, + "learning_rate": 1e-06, + "loss": -0.0341, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0008941831656557042, + "clip_ratio/high_mean": 0.0002329121555249003, + "clip_ratio/low_mean": 0.022007843013852835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022240756079554558, + "epoch": 0.10747693977211069, + "grad_norm": 0.33290112018585205, + "kl": 4.25, + "learning_rate": 1e-06, + "loss": -0.0351, + "step": 619 + }, + { + "clip_ratio/high_max": 0.001463829743443057, + "clip_ratio/high_mean": 0.00045699246584263165, + "clip_ratio/low_mean": 0.02931503113359213, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02977202320471406, + "epoch": 0.10765056972327726, + "grad_norm": 0.4742668569087982, + "kl": 4.8046875, + "learning_rate": 1e-06, + "loss": -0.0349, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0017879624283523299, + "clip_ratio/high_mean": 0.000553971187400748, + "clip_ratio/low_mean": 0.03136679227463901, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031920763896778226, + "epoch": 0.10782419967444384, + "grad_norm": 0.4765302836894989, + "kl": 4.9375, + "learning_rate": 1e-06, + "loss": -0.0352, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0021164309946470894, + "clip_ratio/high_mean": 0.0006073708318581339, + "clip_ratio/low_mean": 0.03018611273728311, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030793483601883054, + "epoch": 0.10799782962561041, + "grad_norm": 0.3854944407939911, + "kl": 4.685546875, + "learning_rate": 1e-06, + "loss": -0.0362, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0033430549810873345, + "clip_ratio/high_mean": 0.0008926578339014668, + "clip_ratio/low_mean": 0.02657561027444899, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027468268061056733, + "epoch": 0.108171459576777, + "grad_norm": 0.21459896862506866, + "kl": 4.228515625, + "learning_rate": 1e-06, + "loss": -0.0374, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0045433912600856274, + "clip_ratio/high_mean": 0.0012118398954044096, + "clip_ratio/low_mean": 0.022658867645077407, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023870707023888826, + "epoch": 0.10834508952794357, + "grad_norm": 0.16254353523254395, + "kl": 3.759765625, + "learning_rate": 1e-06, + "loss": -0.038, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2780.0, + "completions/mean_length": 760.4219360351562, + "completions/mean_terminated_length": 755.2505493164062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.10851871947911014, + "grad_norm": 2.1489577293395996, + "kl": 2.884765625, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 39181494.0, + "reward": 0.2723214328289032, + "reward_std": 0.27227136492729187, + "rewards/accuracy_reward/mean": 0.2723214328289032, + "rewards/accuracy_reward/std": 0.4456520676612854, + "step": 625 + }, + { + "clip_ratio/high_max": 7.979586735018529e-05, + "clip_ratio/high_mean": 1.1399409231671598e-05, + "clip_ratio/low_mean": 0.0008011586742213694, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00081255808277092, + "epoch": 0.10869234943027672, + "grad_norm": 1.872145652770996, + "kl": 3.1884765625, + "learning_rate": 1e-06, + "loss": -0.0408, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0003267186402808875, + "clip_ratio/high_mean": 7.508683074775035e-05, + "clip_ratio/low_mean": 0.045597691321745515, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04567277687601745, + "epoch": 0.1088659793814433, + "grad_norm": 0.24328932166099548, + "kl": 4.0927734375, + "learning_rate": 1e-06, + "loss": -0.0445, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0008614385333203245, + "clip_ratio/high_mean": 0.00020539966953947442, + "clip_ratio/low_mean": 0.05567242577672005, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05587782431393862, + "epoch": 0.10903960933260988, + "grad_norm": 0.427501380443573, + "kl": 5.0390625, + "learning_rate": 1e-06, + "loss": -0.0441, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0014034903215360828, + "clip_ratio/high_mean": 0.00040514532156521454, + "clip_ratio/low_mean": 0.05913233174942434, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05953747755847871, + "epoch": 0.10921323928377645, + "grad_norm": 0.4987650513648987, + "kl": 5.74609375, + "learning_rate": 1e-06, + "loss": -0.0439, + "step": 629 + }, + { + "clip_ratio/high_max": 0.002778789774311008, + "clip_ratio/high_mean": 0.0006521331151816412, + "clip_ratio/low_mean": 0.06060770247131586, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06125983502715826, + "epoch": 0.10938686923494302, + "grad_norm": 0.5003465414047241, + "kl": 5.970703125, + "learning_rate": 1e-06, + "loss": -0.0444, + "step": 630 + }, + { + "clip_ratio/high_max": 0.004048718754347647, + "clip_ratio/high_mean": 0.0008676015031596762, + "clip_ratio/low_mean": 0.06103156483732164, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06189916655421257, + "epoch": 0.1095604991861096, + "grad_norm": 0.446664035320282, + "kl": 5.79296875, + "learning_rate": 1e-06, + "loss": -0.0455, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0057174094654328655, + "clip_ratio/high_mean": 0.0012544812425403506, + "clip_ratio/low_mean": 0.05920460028573871, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.060459082713350654, + "epoch": 0.10973412913727618, + "grad_norm": 0.3102502226829529, + "kl": 5.283203125, + "learning_rate": 1e-06, + "loss": -0.0467, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0200892857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2859.0, + "completions/mean_length": 772.7053833007812, + "completions/mean_terminated_length": 725.5671997070312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.10990775908844276, + "grad_norm": 306.0694885253906, + "kl": 98.44140625, + "learning_rate": 1e-06, + "loss": 0.084, + "num_tokens": 39590546.0, + "reward": 0.2566964328289032, + "reward_std": 0.25159937143325806, + "rewards/accuracy_reward/mean": 0.2566964328289032, + "rewards/accuracy_reward/std": 0.4372987747192383, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0005277524915072718, + "clip_ratio/high_mean": 0.00011331284247262374, + "clip_ratio/low_mean": 6.202662984833296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017533947629999602, + "epoch": 0.11008138903960933, + "grad_norm": 80.43470001220703, + "kl": 20.97265625, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0024060889591055457, + "clip_ratio/high_mean": 0.0006280253287513915, + "clip_ratio/low_mean": 0.00013034390758548398, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007583692522530328, + "epoch": 0.1102550189907759, + "grad_norm": 15.676263809204102, + "kl": 6.75244140625, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0038469959290523548, + "clip_ratio/high_mean": 0.0009743035125211463, + "clip_ratio/low_mean": 0.00030387000833798083, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012781734903910547, + "epoch": 0.11042864894194249, + "grad_norm": 0.754383385181427, + "kl": 2.03271484375, + "learning_rate": 1e-06, + "loss": -0.0083, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0034859093320847023, + "clip_ratio/high_mean": 0.0009097034949263616, + "clip_ratio/low_mean": 0.00034692233202804346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012566258646984352, + "epoch": 0.11060227889310906, + "grad_norm": 28.18426513671875, + "kl": 1.904296875, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0029919691041868646, + "clip_ratio/high_mean": 0.000829929000701668, + "clip_ratio/low_mean": 0.0005260121433821041, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013559411472670035, + "epoch": 0.11077590884427564, + "grad_norm": 101.46619415283203, + "kl": 1.95556640625, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0030570315466320608, + "clip_ratio/high_mean": 0.0008335942206940672, + "clip_ratio/low_mean": 0.0009105716380872764, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017441658874304267, + "epoch": 0.11094953879544221, + "grad_norm": 126.52120208740234, + "kl": 1.998046875, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0028788824020011816, + "clip_ratio/high_mean": 0.0008092611469692201, + "clip_ratio/low_mean": 0.0014489409986708779, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022582022138522007, + "epoch": 0.1111231687466088, + "grad_norm": 101.35161590576172, + "kl": 2.0478515625, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2842.0, + "completions/mean_length": 719.5848388671875, + "completions/mean_terminated_length": 693.0338745117188, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.11129679869777537, + "grad_norm": 1.1386404037475586, + "kl": 2.0595703125, + "learning_rate": 1e-06, + "loss": -0.0666, + "num_tokens": 39977824.0, + "reward": 0.2924107313156128, + "reward_std": 0.3111969828605652, + "rewards/accuracy_reward/mean": 0.2924107015132904, + "rewards/accuracy_reward/std": 0.4553784430027008, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0002611834333947627, + "clip_ratio/high_mean": 4.865459732172894e-05, + "clip_ratio/low_mean": 0.0029702919418923557, + "clip_ratio/low_min": 3.8191261410247535e-05, + "clip_ratio/region_mean": 0.0030189465469447896, + "epoch": 0.11147042864894194, + "grad_norm": 0.8400412797927856, + "kl": 2.4306640625, + "learning_rate": 1e-06, + "loss": -0.07, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0006439860935643082, + "clip_ratio/high_mean": 0.0001959966270987934, + "clip_ratio/low_mean": 0.032481621485203505, + "clip_ratio/low_min": 0.0002546083997003734, + "clip_ratio/region_mean": 0.032677618437446654, + "epoch": 0.11164405860010852, + "grad_norm": 0.24052652716636658, + "kl": 3.1533203125, + "learning_rate": 1e-06, + "loss": -0.0712, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0011642221579677425, + "clip_ratio/high_mean": 0.000412797733588377, + "clip_ratio/low_mean": 0.03878950932994485, + "clip_ratio/low_min": 0.000547408068086952, + "clip_ratio/region_mean": 0.039202305022627115, + "epoch": 0.1118176885512751, + "grad_norm": 0.3440190255641937, + "kl": 3.919921875, + "learning_rate": 1e-06, + "loss": -0.0707, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0014815281392657198, + "clip_ratio/high_mean": 0.0005714107210224029, + "clip_ratio/low_mean": 0.04109106445685029, + "clip_ratio/low_min": 0.0006619818741455674, + "clip_ratio/region_mean": 0.04166247509419918, + "epoch": 0.11199131850244168, + "grad_norm": 0.39806780219078064, + "kl": 4.4970703125, + "learning_rate": 1e-06, + "loss": -0.0704, + "step": 645 + }, + { + "clip_ratio/high_max": 0.002455813344568014, + "clip_ratio/high_mean": 0.0008897982770577073, + "clip_ratio/low_mean": 0.041906824335455894, + "clip_ratio/low_min": 0.0006619818741455674, + "clip_ratio/region_mean": 0.04279662296175957, + "epoch": 0.11216494845360825, + "grad_norm": 0.40997326374053955, + "kl": 4.8037109375, + "learning_rate": 1e-06, + "loss": -0.0707, + "step": 646 + }, + { + "clip_ratio/high_max": 0.003530762405716814, + "clip_ratio/high_mean": 0.001094668397854548, + "clip_ratio/low_mean": 0.04157067742198706, + "clip_ratio/low_min": 0.000636521028354764, + "clip_ratio/region_mean": 0.04266534675844014, + "epoch": 0.11233857840477482, + "grad_norm": 0.3845458924770355, + "kl": 4.8349609375, + "learning_rate": 1e-06, + "loss": -0.0714, + "step": 647 + }, + { + "clip_ratio/high_max": 0.005916245499975048, + "clip_ratio/high_mean": 0.0015689119827584364, + "clip_ratio/low_mean": 0.04074595216661692, + "clip_ratio/low_min": 0.0005092167994007468, + "clip_ratio/region_mean": 0.04231486306525767, + "epoch": 0.1125122083559414, + "grad_norm": 0.3416982889175415, + "kl": 4.59765625, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2712.0, + "completions/mean_length": 776.2031860351562, + "completions/mean_terminated_length": 739.7619018554688, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.11268583830710797, + "grad_norm": 0.464707612991333, + "kl": 3.044921875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 40399747.0, + "reward": 0.345982164144516, + "reward_std": 0.2985723614692688, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0003482559313852107, + "clip_ratio/high_mean": 5.512882285074738e-05, + "clip_ratio/low_mean": 0.0002071316944238788, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026226051386402105, + "epoch": 0.11285946825827456, + "grad_norm": 0.4505893290042877, + "kl": 3.0029296875, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 650 + }, + { + "clip_ratio/high_max": 0.00033475216059741797, + "clip_ratio/high_mean": 7.544343191057123e-05, + "clip_ratio/low_mean": 0.0009921241417032434, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010675675366655923, + "epoch": 0.11303309820944113, + "grad_norm": 0.3903134763240814, + "kl": 3.076171875, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0005085181328468025, + "clip_ratio/high_mean": 0.00012274366645215196, + "clip_ratio/low_mean": 0.0032352301859646104, + "clip_ratio/low_min": 2.664677049324382e-05, + "clip_ratio/region_mean": 0.003357973830134142, + "epoch": 0.1132067281606077, + "grad_norm": 0.3097863495349884, + "kl": 3.24609375, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0011543337641342077, + "clip_ratio/high_mean": 0.00032505448871233966, + "clip_ratio/low_mean": 0.0076023939764127135, + "clip_ratio/low_min": 5.329354098648764e-05, + "clip_ratio/region_mean": 0.007927448459668085, + "epoch": 0.11338035811177428, + "grad_norm": 0.23493099212646484, + "kl": 3.494140625, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0019325695902807638, + "clip_ratio/high_mean": 0.0005031644632254029, + "clip_ratio/low_mean": 0.01296780549455434, + "clip_ratio/low_min": 9.326369763584808e-05, + "clip_ratio/region_mean": 0.013470969803165644, + "epoch": 0.11355398806294086, + "grad_norm": 0.18820630013942719, + "kl": 3.763671875, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 654 + }, + { + "clip_ratio/high_max": 0.00326030574797187, + "clip_ratio/high_mean": 0.00088747005793266, + "clip_ratio/low_mean": 0.018721132539212704, + "clip_ratio/low_min": 0.00017144162848126143, + "clip_ratio/region_mean": 0.01960860169492662, + "epoch": 0.11372761801410744, + "grad_norm": 0.19905301928520203, + "kl": 4.0126953125, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 655 + }, + { + "clip_ratio/high_max": 0.004028455121442676, + "clip_ratio/high_mean": 0.0012612718637683429, + "clip_ratio/low_mean": 0.023441870231181383, + "clip_ratio/low_min": 0.00017320401093456894, + "clip_ratio/region_mean": 0.024703143164515495, + "epoch": 0.11390124796527401, + "grad_norm": 0.20670488476753235, + "kl": 4.1728515625, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.006696428571428603, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3038.0, + "completions/mean_length": 693.2433471679688, + "completions/mean_terminated_length": 677.2067260742188, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.11407487791644058, + "grad_norm": 0.36765459179878235, + "kl": 7.298828125, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 40771992.0, + "reward": 0.2209821492433548, + "reward_std": 0.24016934633255005, + "rewards/accuracy_reward/mean": 0.2209821492433548, + "rewards/accuracy_reward/std": 0.4153723120689392, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0001250129771506181, + "clip_ratio/high_mean": 2.419997792912909e-05, + "clip_ratio/low_mean": 0.0003367218753282941, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00036092184677727346, + "epoch": 0.11424850786760717, + "grad_norm": 0.33131304383277893, + "kl": 7.5078125, + "learning_rate": 1e-06, + "loss": -0.0189, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0005382003246268141, + "clip_ratio/high_mean": 0.00010499809343400557, + "clip_ratio/low_mean": 0.002406984383924282, + "clip_ratio/low_min": 7.418397581204772e-05, + "clip_ratio/region_mean": 0.002511982505893684, + "epoch": 0.11442213781877374, + "grad_norm": 0.29130080342292786, + "kl": 7.69921875, + "learning_rate": 1e-06, + "loss": -0.0197, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0011801818600361003, + "clip_ratio/high_mean": 0.00022909407357474265, + "clip_ratio/low_mean": 0.004107752152776811, + "clip_ratio/low_min": 7.418397581204772e-05, + "clip_ratio/region_mean": 0.004336846359365154, + "epoch": 0.11459576776994032, + "grad_norm": 0.28750213980674744, + "kl": 7.8828125, + "learning_rate": 1e-06, + "loss": -0.0206, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0025106486900767777, + "clip_ratio/high_mean": 0.0005127164749865187, + "clip_ratio/low_mean": 0.006557230495673139, + "clip_ratio/low_min": 0.00014836795162409544, + "clip_ratio/region_mean": 0.00706994671782013, + "epoch": 0.11476939772110689, + "grad_norm": 0.2582945227622986, + "kl": 7.958984375, + "learning_rate": 1e-06, + "loss": -0.0215, + "step": 661 + }, + { + "clip_ratio/high_max": 0.004817518693016609, + "clip_ratio/high_mean": 0.0009760780121723656, + "clip_ratio/low_mean": 0.01074460122617893, + "clip_ratio/low_min": 0.00022255192743614316, + "clip_ratio/region_mean": 0.011720678783603944, + "epoch": 0.11494302767227346, + "grad_norm": 0.20326831936836243, + "kl": 7.916015625, + "learning_rate": 1e-06, + "loss": -0.0225, + "step": 662 + }, + { + "clip_ratio/high_max": 0.006561471276654629, + "clip_ratio/high_mean": 0.0013646450370288221, + "clip_ratio/low_mean": 0.016150905648828484, + "clip_ratio/low_min": 0.0001854599395301193, + "clip_ratio/region_mean": 0.017515550527605228, + "epoch": 0.11511665762344005, + "grad_norm": 0.1489320546388626, + "kl": 7.8203125, + "learning_rate": 1e-06, + "loss": -0.0233, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0085283105581766, + "clip_ratio/high_mean": 0.0018049097889161203, + "clip_ratio/low_mean": 0.020808621440664865, + "clip_ratio/low_min": 0.0002967359032481909, + "clip_ratio/region_mean": 0.022613529989030212, + "epoch": 0.11529028757460662, + "grad_norm": 0.13393846154212952, + "kl": 7.74609375, + "learning_rate": 1e-06, + "loss": -0.0239, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3029.0, + "completions/mean_length": 680.109375, + "completions/mean_terminated_length": 653.1128540039062, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.1154639175257732, + "grad_norm": 0.8126171231269836, + "kl": 5.271484375, + "learning_rate": 1e-06, + "loss": -0.0647, + "num_tokens": 41138953.0, + "reward": 0.2566964328289032, + "reward_std": 0.2676740288734436, + "rewards/accuracy_reward/mean": 0.2566964328289032, + "rewards/accuracy_reward/std": 0.4372987747192383, + "step": 665 + }, + { + "clip_ratio/high_max": 0.00022936935238249134, + "clip_ratio/high_mean": 4.026548367619398e-05, + "clip_ratio/low_mean": 0.0016838945548443007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017241600380657474, + "epoch": 0.11563754747693977, + "grad_norm": 0.6560788750648499, + "kl": 5.716796875, + "learning_rate": 1e-06, + "loss": -0.0667, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0006214553977770265, + "clip_ratio/high_mean": 0.0001190310495076119, + "clip_ratio/low_mean": 0.017573916469700634, + "clip_ratio/low_min": 0.0001666153984842822, + "clip_ratio/region_mean": 0.017692946828901768, + "epoch": 0.11581117742810634, + "grad_norm": 0.251058965921402, + "kl": 6.63671875, + "learning_rate": 1e-06, + "loss": -0.0685, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0015217331238090992, + "clip_ratio/high_mean": 0.00033614488575040014, + "clip_ratio/low_mean": 0.029511447879485786, + "clip_ratio/low_min": 0.00023069824965205044, + "clip_ratio/region_mean": 0.02984759386163205, + "epoch": 0.11598480737927293, + "grad_norm": 0.47080737352371216, + "kl": 7.529296875, + "learning_rate": 1e-06, + "loss": -0.0688, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0026264358129992615, + "clip_ratio/high_mean": 0.0006635536292378674, + "clip_ratio/low_mean": 0.03325444017536938, + "clip_ratio/low_min": 0.0003075976565014571, + "clip_ratio/region_mean": 0.03391799388919026, + "epoch": 0.1161584373304395, + "grad_norm": 0.5310782194137573, + "kl": 8.005859375, + "learning_rate": 1e-06, + "loss": -0.0692, + "step": 669 + }, + { + "clip_ratio/high_max": 0.004030575495562516, + "clip_ratio/high_mean": 0.0010049421889561927, + "clip_ratio/low_mean": 0.033713193610310555, + "clip_ratio/low_min": 0.00034604736720211804, + "clip_ratio/region_mean": 0.03471813537180424, + "epoch": 0.11633206728160608, + "grad_norm": 0.4843617379665375, + "kl": 7.966796875, + "learning_rate": 1e-06, + "loss": -0.0701, + "step": 670 + }, + { + "clip_ratio/high_max": 0.005619428317004349, + "clip_ratio/high_mean": 0.0013814457051921636, + "clip_ratio/low_mean": 0.032521067187190056, + "clip_ratio/low_min": 0.0002563313755672425, + "clip_ratio/region_mean": 0.03390251228120178, + "epoch": 0.11650569723277265, + "grad_norm": 0.3508221507072449, + "kl": 7.490234375, + "learning_rate": 1e-06, + "loss": -0.0713, + "step": 671 + }, + { + "clip_ratio/high_max": 0.009446338764973916, + "clip_ratio/high_mean": 0.0021355955250328407, + "clip_ratio/low_mean": 0.030788743984885514, + "clip_ratio/low_min": 0.00026914794580079615, + "clip_ratio/region_mean": 0.032924340572208166, + "epoch": 0.11667932718393922, + "grad_norm": 0.19437843561172485, + "kl": 6.884765625, + "learning_rate": 1e-06, + "loss": -0.0723, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2893.0, + "completions/mean_length": 648.7455444335938, + "completions/mean_terminated_length": 626.9144287109375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.11685295713510581, + "grad_norm": 0.5294825434684753, + "kl": 9.5078125, + "learning_rate": 1e-06, + "loss": -0.0648, + "num_tokens": 41493367.0, + "reward": 0.3169642984867096, + "reward_std": 0.2548309564590454, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0004361910487205023, + "clip_ratio/high_mean": 9.606866888134391e-05, + "clip_ratio/low_mean": 0.00013957597570879443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023564463708680705, + "epoch": 0.11702658708627238, + "grad_norm": 0.5484299659729004, + "kl": 9.19140625, + "learning_rate": 1e-06, + "loss": -0.0653, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0010105850378749892, + "clip_ratio/high_mean": 0.0002398659316895646, + "clip_ratio/low_mean": 0.000746842437365558, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009867083754215855, + "epoch": 0.11720021703743896, + "grad_norm": 0.497395783662796, + "kl": 9.3984375, + "learning_rate": 1e-06, + "loss": -0.0664, + "step": 675 + }, + { + "clip_ratio/high_max": 0.001578275790961925, + "clip_ratio/high_mean": 0.00043838565215992276, + "clip_ratio/low_mean": 0.002980129633215256, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003418515385419596, + "epoch": 0.11737384698860553, + "grad_norm": 0.3782389461994171, + "kl": 9.9921875, + "learning_rate": 1e-06, + "loss": -0.068, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0026164127921219915, + "clip_ratio/high_mean": 0.0007917889670352452, + "clip_ratio/low_mean": 0.010751545167295262, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011543334141606465, + "epoch": 0.1175474769397721, + "grad_norm": 0.2847672700881958, + "kl": 11.01171875, + "learning_rate": 1e-06, + "loss": -0.0692, + "step": 677 + }, + { + "clip_ratio/high_max": 0.003692533355206251, + "clip_ratio/high_mean": 0.0011546306559466757, + "clip_ratio/low_mean": 0.02072130615124479, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021875937934964895, + "epoch": 0.11772110689093869, + "grad_norm": 0.4478859603404999, + "kl": 11.83203125, + "learning_rate": 1e-06, + "loss": -0.0699, + "step": 678 + }, + { + "clip_ratio/high_max": 0.004253494858858176, + "clip_ratio/high_mean": 0.0014264539022406098, + "clip_ratio/low_mean": 0.025911460164934397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02733791375067085, + "epoch": 0.11789473684210526, + "grad_norm": 0.4349210560321808, + "kl": 11.9375, + "learning_rate": 1e-06, + "loss": -0.0707, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0056857692543417215, + "clip_ratio/high_mean": 0.001825912950152997, + "clip_ratio/low_mean": 0.0278399697272107, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029665881767868996, + "epoch": 0.11806836679327185, + "grad_norm": 0.26387783885002136, + "kl": 11.515625, + "learning_rate": 1e-06, + "loss": -0.0716, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004464285714285698, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2402.0, + "completions/mean_length": 614.5267944335938, + "completions/mean_terminated_length": 603.5067749023438, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.11824199674443842, + "grad_norm": 1.057134985923767, + "kl": 11.73046875, + "learning_rate": 1e-06, + "loss": -0.0458, + "num_tokens": 41840259.0, + "reward": 0.2767857313156128, + "reward_std": 0.24589210748672485, + "rewards/accuracy_reward/mean": 0.2767857015132904, + "rewards/accuracy_reward/std": 0.44790980219841003, + "step": 681 + }, + { + "clip_ratio/high_max": 0.00043431236372271087, + "clip_ratio/high_mean": 9.255178349576454e-05, + "clip_ratio/low_mean": 0.00020368358400446596, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029623536011058604, + "epoch": 0.11841562669560499, + "grad_norm": 0.9377912282943726, + "kl": 12.29296875, + "learning_rate": 1e-06, + "loss": -0.0478, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0010419547870696988, + "clip_ratio/high_mean": 0.0002492497114872094, + "clip_ratio/low_mean": 0.015468305384274572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015717554779257625, + "epoch": 0.11858925664677157, + "grad_norm": 0.31762343645095825, + "kl": 14.1328125, + "learning_rate": 1e-06, + "loss": -0.0504, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0014245361453504302, + "clip_ratio/high_mean": 0.0003801634356932482, + "clip_ratio/low_mean": 0.03509276208933443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03547292680013925, + "epoch": 0.11876288659793814, + "grad_norm": 0.8181064128875732, + "kl": 16.0859375, + "learning_rate": 1e-06, + "loss": -0.05, + "step": 684 + }, + { + "clip_ratio/high_max": 0.002247457108751405, + "clip_ratio/high_mean": 0.0005686468266503653, + "clip_ratio/low_mean": 0.037386909360066056, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03795555653050542, + "epoch": 0.11893651654910473, + "grad_norm": 0.8537558913230896, + "kl": 16.40625, + "learning_rate": 1e-06, + "loss": -0.0504, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0025898398671415634, + "clip_ratio/high_mean": 0.000680348341120407, + "clip_ratio/low_mean": 0.03349249542225152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03417284297756851, + "epoch": 0.1191101465002713, + "grad_norm": 0.5478917360305786, + "kl": 15.1328125, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 686 + }, + { + "clip_ratio/high_max": 0.003247669665142894, + "clip_ratio/high_mean": 0.0008908346462703776, + "clip_ratio/low_mean": 0.026118723559193313, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027009558049030602, + "epoch": 0.11928377645143787, + "grad_norm": 0.28566834330558777, + "kl": 13.3984375, + "learning_rate": 1e-06, + "loss": -0.0535, + "step": 687 + }, + { + "clip_ratio/high_max": 0.004426188315846957, + "clip_ratio/high_mean": 0.0012730817252304405, + "clip_ratio/low_mean": 0.020707304123789072, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021980386460199952, + "epoch": 0.11945740640260445, + "grad_norm": 0.538597583770752, + "kl": 12.01953125, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2857.0, + "completions/mean_length": 614.4017944335938, + "completions/mean_terminated_length": 592.2612915039062, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.11963103635377102, + "grad_norm": 0.7378387451171875, + "kl": 11.171875, + "learning_rate": 1e-06, + "loss": -0.0431, + "num_tokens": 42177567.0, + "reward": 0.283482164144516, + "reward_std": 0.2672211825847626, + "rewards/accuracy_reward/mean": 0.2834821343421936, + "rewards/accuracy_reward/std": 0.4511922299861908, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0005006133578717709, + "clip_ratio/high_mean": 0.00011063688680224004, + "clip_ratio/low_mean": 0.00027677248499458074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003874093667945999, + "epoch": 0.11980466630493761, + "grad_norm": 0.6361890435218811, + "kl": 11.7578125, + "learning_rate": 1e-06, + "loss": -0.0441, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0013378266703512054, + "clip_ratio/high_mean": 0.0003025026790055563, + "clip_ratio/low_mean": 0.007103945157723501, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007406447810353711, + "epoch": 0.11997829625610418, + "grad_norm": 0.4497068524360657, + "kl": 13.09765625, + "learning_rate": 1e-06, + "loss": -0.0455, + "step": 691 + }, + { + "clip_ratio/high_max": 0.002104917664837558, + "clip_ratio/high_mean": 0.0005724101483792765, + "clip_ratio/low_mean": 0.014890748250763863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01546315848827362, + "epoch": 0.12015192620727075, + "grad_norm": 0.559781551361084, + "kl": 14.0, + "learning_rate": 1e-06, + "loss": -0.0465, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0025696518569020554, + "clip_ratio/high_mean": 0.0007730627548880875, + "clip_ratio/low_mean": 0.019494248321279883, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020267311134375632, + "epoch": 0.12032555615843733, + "grad_norm": 0.5229817628860474, + "kl": 13.9375, + "learning_rate": 1e-06, + "loss": -0.048, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0035425283276708797, + "clip_ratio/high_mean": 0.0011242681503063068, + "clip_ratio/low_mean": 0.018472710042260587, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01959697814891115, + "epoch": 0.1204991861096039, + "grad_norm": 0.2480653077363968, + "kl": 13.15234375, + "learning_rate": 1e-06, + "loss": -0.0495, + "step": 694 + }, + { + "clip_ratio/high_max": 0.004971231464878656, + "clip_ratio/high_mean": 0.0015654434755560942, + "clip_ratio/low_mean": 0.022072449792176485, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02363789326045662, + "epoch": 0.12067281606077049, + "grad_norm": 0.2818325161933899, + "kl": 12.43359375, + "learning_rate": 1e-06, + "loss": -0.0505, + "step": 695 + }, + { + "clip_ratio/high_max": 0.006383289641235024, + "clip_ratio/high_mean": 0.0020549223117996007, + "clip_ratio/low_mean": 0.02735915663652122, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029414079966954887, + "epoch": 0.12084644601193706, + "grad_norm": 0.30497002601623535, + "kl": 12.08984375, + "learning_rate": 1e-06, + "loss": -0.0514, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2968.0, + "completions/max_terminated_length": 2968.0, + "completions/mean_length": 670.4777221679688, + "completions/mean_terminated_length": 670.4777221679688, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.12102007596310363, + "grad_norm": 0.5068403482437134, + "kl": 13.4296875, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 42536061.0, + "reward": 0.2723214328289032, + "reward_std": 0.2957935333251953, + "rewards/accuracy_reward/mean": 0.2723214328289032, + "rewards/accuracy_reward/std": 0.4456520676612854, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0004223357973387465, + "clip_ratio/high_mean": 8.917866261981544e-05, + "clip_ratio/low_mean": 6.415211669263954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015333078044932336, + "epoch": 0.12119370591427021, + "grad_norm": 0.4514373540878296, + "kl": 13.9765625, + "learning_rate": 1e-06, + "loss": -0.0264, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0005598397019639378, + "clip_ratio/high_mean": 0.0001164139655429608, + "clip_ratio/low_mean": 0.0018725597037700936, + "clip_ratio/low_min": 1.9769096979871392e-05, + "clip_ratio/region_mean": 0.001988973715924658, + "epoch": 0.12136733586543678, + "grad_norm": 0.3185842037200928, + "kl": 14.90625, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0009186568859149702, + "clip_ratio/high_mean": 0.00023309874995902646, + "clip_ratio/low_mean": 0.014777723583392799, + "clip_ratio/low_min": 3.740275133168325e-05, + "clip_ratio/region_mean": 0.015010821865871549, + "epoch": 0.12154096581660337, + "grad_norm": 0.47053104639053345, + "kl": 15.953125, + "learning_rate": 1e-06, + "loss": -0.0283, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0017083885104511864, + "clip_ratio/high_mean": 0.0004768441322084982, + "clip_ratio/low_mean": 0.024534757481887937, + "clip_ratio/low_min": 0.00011220826127100736, + "clip_ratio/region_mean": 0.025011601275764406, + "epoch": 0.12171459576776994, + "grad_norm": 0.6438759565353394, + "kl": 16.3125, + "learning_rate": 1e-06, + "loss": -0.0289, + "step": 701 + }, + { + "clip_ratio/high_max": 0.002865908812964335, + "clip_ratio/high_mean": 0.0008278902460006066, + "clip_ratio/low_mean": 0.0223706834949553, + "clip_ratio/low_min": 0.00011220826127100736, + "clip_ratio/region_mean": 0.023198573966510594, + "epoch": 0.12188822571893651, + "grad_norm": 0.4525564908981323, + "kl": 15.703125, + "learning_rate": 1e-06, + "loss": -0.0301, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0047902277001412585, + "clip_ratio/high_mean": 0.0012683085369644687, + "clip_ratio/low_mean": 0.017840619198977947, + "clip_ratio/low_min": 0.0002618192811496556, + "clip_ratio/region_mean": 0.01910892711021006, + "epoch": 0.1220618556701031, + "grad_norm": 0.20877709984779358, + "kl": 14.5703125, + "learning_rate": 1e-06, + "loss": -0.0313, + "step": 703 + }, + { + "clip_ratio/high_max": 0.007265792606631294, + "clip_ratio/high_mean": 0.0019484695149003528, + "clip_ratio/low_mean": 0.016484632273204625, + "clip_ratio/low_min": 0.0003740275278687477, + "clip_ratio/region_mean": 0.018433101591654122, + "epoch": 0.12223548562126967, + "grad_norm": 0.2209397405385971, + "kl": 13.5703125, + "learning_rate": 1e-06, + "loss": -0.0321, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2875.0, + "completions/mean_length": 637.9285888671875, + "completions/mean_terminated_length": 599.2925415039062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.12240911557243625, + "grad_norm": 0.4362111985683441, + "kl": 21.4296875, + "learning_rate": 1e-06, + "loss": -0.0377, + "num_tokens": 42886957.0, + "reward": 0.227678582072258, + "reward_std": 0.2382984608411789, + "rewards/accuracy_reward/mean": 0.2276785671710968, + "rewards/accuracy_reward/std": 0.41980284452438354, + "step": 705 + }, + { + "clip_ratio/high_max": 0.000502253787999507, + "clip_ratio/high_mean": 9.914156680679298e-05, + "clip_ratio/low_mean": 0.00010691120951378252, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020605278314178577, + "epoch": 0.12258274552360282, + "grad_norm": 0.4340856075286865, + "kl": 20.8671875, + "learning_rate": 1e-06, + "loss": -0.0384, + "step": 706 + }, + { + "clip_ratio/high_max": 0.000674797276587924, + "clip_ratio/high_mean": 0.00012064862130500842, + "clip_ratio/low_mean": 0.0010949741044896655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012156227421655785, + "epoch": 0.12275637547476939, + "grad_norm": 0.3860345184803009, + "kl": 20.5625, + "learning_rate": 1e-06, + "loss": -0.0395, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0013120527291903272, + "clip_ratio/high_mean": 0.00034762829045575927, + "clip_ratio/low_mean": 0.004174543239059858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0045221713808132336, + "epoch": 0.12293000542593598, + "grad_norm": 0.3030325174331665, + "kl": 20.875, + "learning_rate": 1e-06, + "loss": -0.0408, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0024040008865995333, + "clip_ratio/high_mean": 0.0006403755287465174, + "clip_ratio/low_mean": 0.00829006260028109, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008930437790695578, + "epoch": 0.12310363537710255, + "grad_norm": 0.2764405310153961, + "kl": 21.2265625, + "learning_rate": 1e-06, + "loss": -0.0419, + "step": 709 + }, + { + "clip_ratio/high_max": 0.004547749180346727, + "clip_ratio/high_mean": 0.0011639917429420166, + "clip_ratio/low_mean": 0.014231910987291485, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015395902679301798, + "epoch": 0.12327726532826913, + "grad_norm": 0.31677061319351196, + "kl": 21.5078125, + "learning_rate": 1e-06, + "loss": -0.0429, + "step": 710 + }, + { + "clip_ratio/high_max": 0.006067883019568399, + "clip_ratio/high_mean": 0.00150514482811559, + "clip_ratio/low_mean": 0.018630710314027965, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02013585576787591, + "epoch": 0.1234508952794357, + "grad_norm": 0.3218320608139038, + "kl": 21.171875, + "learning_rate": 1e-06, + "loss": -0.0439, + "step": 711 + }, + { + "clip_ratio/high_max": 0.00852405495243147, + "clip_ratio/high_mean": 0.0021275342733133584, + "clip_ratio/low_mean": 0.020321425166912377, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02244895964395255, + "epoch": 0.12362452523060227, + "grad_norm": 0.23393113911151886, + "kl": 20.46875, + "learning_rate": 1e-06, + "loss": -0.045, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004464285714285698, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2833.0, + "completions/mean_length": 571.2120971679688, + "completions/mean_terminated_length": 559.997802734375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.12379815518176886, + "grad_norm": 0.844511866569519, + "kl": 22.0546875, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 43201876.0, + "reward": 0.345982164144516, + "reward_std": 0.24311557412147522, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0002597727261672844, + "clip_ratio/high_mean": 4.747833156670822e-05, + "clip_ratio/low_mean": 0.00017991056984101306, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002273889053867606, + "epoch": 0.12397178513293543, + "grad_norm": 0.6565654277801514, + "kl": 23.5546875, + "learning_rate": 1e-06, + "loss": -0.0166, + "step": 714 + }, + { + "clip_ratio/high_max": 0.00045800968655385077, + "clip_ratio/high_mean": 0.0001082020974081388, + "clip_ratio/low_mean": 0.01822957443073392, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018337776564294472, + "epoch": 0.12414541508410201, + "grad_norm": 0.7580183148384094, + "kl": 26.5, + "learning_rate": 1e-06, + "loss": -0.0179, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0009834185511863325, + "clip_ratio/high_mean": 0.0002459450633978122, + "clip_ratio/low_mean": 0.02964492584578693, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029890872072428465, + "epoch": 0.12431904503526858, + "grad_norm": 0.8491480946540833, + "kl": 27.1796875, + "learning_rate": 1e-06, + "loss": -0.0189, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0018510265144868754, + "clip_ratio/high_mean": 0.0004337946629675571, + "clip_ratio/low_mean": 0.022949302685447037, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023383096558973193, + "epoch": 0.12449267498643515, + "grad_norm": 0.3857336640357971, + "kl": 25.296875, + "learning_rate": 1e-06, + "loss": -0.0202, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0027962009771727026, + "clip_ratio/high_mean": 0.0006686511314910604, + "clip_ratio/low_mean": 0.02125896653160453, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02192761783953756, + "epoch": 0.12466630493760174, + "grad_norm": 0.4497033655643463, + "kl": 23.375, + "learning_rate": 1e-06, + "loss": -0.0209, + "step": 718 + }, + { + "clip_ratio/high_max": 0.003941635790397413, + "clip_ratio/high_mean": 0.0009439620152988937, + "clip_ratio/low_mean": 0.021388595807366073, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02233255736064166, + "epoch": 0.12483993488876831, + "grad_norm": 0.5077589154243469, + "kl": 22.390625, + "learning_rate": 1e-06, + "loss": -0.0217, + "step": 719 + }, + { + "clip_ratio/high_max": 0.005532946815947071, + "clip_ratio/high_mean": 0.0014299757312983274, + "clip_ratio/low_mean": 0.0249417198356241, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026371695450507104, + "epoch": 0.1250135648399349, + "grad_norm": 0.38664859533309937, + "kl": 22.4609375, + "learning_rate": 1e-06, + "loss": -0.0229, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2935.0, + "completions/mean_length": 683.6473388671875, + "completions/mean_terminated_length": 662.1306762695312, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.12518719479110146, + "grad_norm": 0.7476671934127808, + "kl": 26.953125, + "learning_rate": 1e-06, + "loss": -0.0166, + "num_tokens": 43571926.0, + "reward": 0.2187500149011612, + "reward_std": 0.251229852437973, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4138607978820801, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0005206793694014777, + "clip_ratio/high_mean": 0.00010113227210695186, + "clip_ratio/low_mean": 0.009645169164286926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009746301861014217, + "epoch": 0.12536082474226803, + "grad_norm": 0.718325674533844, + "kl": 31.140625, + "learning_rate": 1e-06, + "loss": -0.0179, + "step": 722 + }, + { + "clip_ratio/high_max": 0.000554788956833363, + "clip_ratio/high_mean": 0.00013990106492656196, + "clip_ratio/low_mean": 0.033988814626354724, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03412871528416872, + "epoch": 0.1255344546934346, + "grad_norm": 1.7202353477478027, + "kl": 32.640625, + "learning_rate": 1e-06, + "loss": -0.0179, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0010388791015429888, + "clip_ratio/high_mean": 0.0002944316647699452, + "clip_ratio/low_mean": 0.009180679480778053, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009475110899074934, + "epoch": 0.1257080846446012, + "grad_norm": 0.29151713848114014, + "kl": 29.265625, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 724 + }, + { + "clip_ratio/high_max": 0.002147727835108526, + "clip_ratio/high_mean": 0.0005311939294188051, + "clip_ratio/low_mean": 0.006062938104150817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006594131933525205, + "epoch": 0.12588171459576777, + "grad_norm": 0.5123766660690308, + "kl": 26.21875, + "learning_rate": 1e-06, + "loss": -0.0212, + "step": 725 + }, + { + "clip_ratio/high_max": 0.003139905515126884, + "clip_ratio/high_mean": 0.0007965798213263042, + "clip_ratio/low_mean": 0.00912831025198102, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009924889833200723, + "epoch": 0.12605534454693434, + "grad_norm": 0.5350141525268555, + "kl": 25.34375, + "learning_rate": 1e-06, + "loss": -0.022, + "step": 726 + }, + { + "clip_ratio/high_max": 0.004765344536281191, + "clip_ratio/high_mean": 0.0012609330369741656, + "clip_ratio/low_mean": 0.018233147158753127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019494080392178148, + "epoch": 0.12622897449810092, + "grad_norm": 0.26549217104911804, + "kl": 26.40625, + "learning_rate": 1e-06, + "loss": -0.0232, + "step": 727 + }, + { + "clip_ratio/high_max": 0.006262515715206973, + "clip_ratio/high_mean": 0.0015791638870723546, + "clip_ratio/low_mean": 0.026242037711199373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027821201889310032, + "epoch": 0.12640260444926749, + "grad_norm": 0.44607532024383545, + "kl": 27.25, + "learning_rate": 1e-06, + "loss": -0.024, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2605.0, + "completions/mean_length": 726.0111694335938, + "completions/mean_terminated_length": 683.3568115234375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.12657623440043408, + "grad_norm": 1.7821310758590698, + "kl": 35.953125, + "learning_rate": 1e-06, + "loss": 0.0603, + "num_tokens": 43968987.0, + "reward": 0.2879464328289032, + "reward_std": 0.28767478466033936, + "rewards/accuracy_reward/mean": 0.2879464328289032, + "rewards/accuracy_reward/std": 0.4533121883869171, + "step": 729 + }, + { + "clip_ratio/high_max": 0.00043652971362462267, + "clip_ratio/high_mean": 8.235984137172636e-05, + "clip_ratio/low_mean": 0.0003030972093256423, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003854570313706063, + "epoch": 0.12674986435160066, + "grad_norm": 1.2466399669647217, + "kl": 39.96875, + "learning_rate": 1e-06, + "loss": 0.0564, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0016646132153255166, + "clip_ratio/high_mean": 0.00027320378421791247, + "clip_ratio/low_mean": 0.06577640469186008, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06604960886761546, + "epoch": 0.12692349430276723, + "grad_norm": 2.1273996829986572, + "kl": 48.609375, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0016282315282296622, + "clip_ratio/high_mean": 0.00030084022455412196, + "clip_ratio/low_mean": 0.08646585047245026, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08676668908447027, + "epoch": 0.1270971242539338, + "grad_norm": 2.5490400791168213, + "kl": 51.4375, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0014116856254986487, + "clip_ratio/high_mean": 0.0003258292381360661, + "clip_ratio/low_mean": 0.08114128303714097, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08146711252629757, + "epoch": 0.12727075420510037, + "grad_norm": 2.098358154296875, + "kl": 48.3125, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0020089622848900035, + "clip_ratio/high_mean": 0.000467218630546995, + "clip_ratio/low_mean": 0.042262921342626214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0427301392192021, + "epoch": 0.12744438415626697, + "grad_norm": 0.6683831810951233, + "kl": 41.109375, + "learning_rate": 1e-06, + "loss": 0.0517, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0037806392792845145, + "clip_ratio/high_mean": 0.0008313740108860657, + "clip_ratio/low_mean": 0.02718017384177074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028011547750793397, + "epoch": 0.12761801410743354, + "grad_norm": 1.2172889709472656, + "kl": 36.84375, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 735 + }, + { + "clip_ratio/high_max": 0.006229607301065698, + "clip_ratio/high_mean": 0.0013613662886200473, + "clip_ratio/low_mean": 0.02459827222628519, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025959639227949083, + "epoch": 0.1277916440586001, + "grad_norm": 1.2146602869033813, + "kl": 36.671875, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.024553571428571397, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2671.0, + "completions/mean_length": 729.9107666015625, + "completions/mean_terminated_length": 670.9564819335938, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.12796527400976668, + "grad_norm": 2.5040786266326904, + "kl": 46.34375, + "learning_rate": 1e-06, + "loss": 0.1352, + "num_tokens": 44362043.0, + "reward": 0.3169642984867096, + "reward_std": 0.25581032037734985, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "step": 737 + }, + { + "clip_ratio/high_max": 0.000400779284973396, + "clip_ratio/high_mean": 8.08332656561106e-05, + "clip_ratio/low_mean": 0.021229912352282554, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021310745243681595, + "epoch": 0.12813890396093325, + "grad_norm": 1.4405072927474976, + "kl": 54.59375, + "learning_rate": 1e-06, + "loss": 0.1288, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0009463762053201208, + "clip_ratio/high_mean": 0.00017246381355562335, + "clip_ratio/low_mean": 0.0920712947845459, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09224375709891319, + "epoch": 0.12831253391209985, + "grad_norm": 2.8465914726257324, + "kl": 62.96875, + "learning_rate": 1e-06, + "loss": 0.1311, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0013966588066978147, + "clip_ratio/high_mean": 0.00026077190420892293, + "clip_ratio/low_mean": 0.10407100198790431, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.10433177510276437, + "epoch": 0.12848616386326642, + "grad_norm": 2.9912407398223877, + "kl": 63.890625, + "learning_rate": 1e-06, + "loss": 0.1312, + "step": 740 + }, + { + "clip_ratio/high_max": 0.001549617847558693, + "clip_ratio/high_mean": 0.0003375528029891939, + "clip_ratio/low_mean": 0.07982926978729665, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08016681671142578, + "epoch": 0.128659793814433, + "grad_norm": 1.6735918521881104, + "kl": 59.03125, + "learning_rate": 1e-06, + "loss": 0.1273, + "step": 741 + }, + { + "clip_ratio/high_max": 0.002565731190770748, + "clip_ratio/high_mean": 0.0005194415034566191, + "clip_ratio/low_mean": 0.05097265262156725, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.051492093480192125, + "epoch": 0.12883342376559956, + "grad_norm": 0.6801656484603882, + "kl": 50.109375, + "learning_rate": 1e-06, + "loss": 0.1251, + "step": 742 + }, + { + "clip_ratio/high_max": 0.005787100815723534, + "clip_ratio/high_mean": 0.001012707802942714, + "clip_ratio/low_mean": 0.037676974054193124, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03868967975722626, + "epoch": 0.12900705371676613, + "grad_norm": 1.3840670585632324, + "kl": 44.03125, + "learning_rate": 1e-06, + "loss": 0.126, + "step": 743 + }, + { + "clip_ratio/high_max": 0.006358775128319394, + "clip_ratio/high_mean": 0.0012021344391541788, + "clip_ratio/low_mean": 0.03353850849089213, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03474064252804965, + "epoch": 0.12918068366793273, + "grad_norm": 1.489719271659851, + "kl": 42.671875, + "learning_rate": 1e-06, + "loss": 0.1252, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.049107142857142905, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3025.0, + "completions/mean_length": 927.49560546875, + "completions/mean_terminated_length": 816.7465209960938, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.1293543136190993, + "grad_norm": 2.2461822032928467, + "kl": 44.3359375, + "learning_rate": 1e-06, + "loss": 0.1027, + "num_tokens": 44849865.0, + "reward": 0.345982164144516, + "reward_std": 0.253793329000473, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0004250576130289119, + "clip_ratio/high_mean": 8.815306864562444e-05, + "clip_ratio/low_mean": 0.013264698922284879, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013352851150557399, + "epoch": 0.12952794357026587, + "grad_norm": 1.0266549587249756, + "kl": 50.890625, + "learning_rate": 1e-06, + "loss": 0.0963, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0009499599600530928, + "clip_ratio/high_mean": 0.000260471225828951, + "clip_ratio/low_mean": 0.09373105992563069, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.093991530360654, + "epoch": 0.12970157352143244, + "grad_norm": 2.617295265197754, + "kl": 62.078125, + "learning_rate": 1e-06, + "loss": 0.1006, + "step": 747 + }, + { + "clip_ratio/high_max": 0.001219954570842674, + "clip_ratio/high_mean": 0.00036379172297529294, + "clip_ratio/low_mean": 0.10866109537892044, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.10902489116415381, + "epoch": 0.129875203472599, + "grad_norm": 3.090850591659546, + "kl": 67.28125, + "learning_rate": 1e-06, + "loss": 0.1049, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0015783804337843321, + "clip_ratio/high_mean": 0.0004537050581348012, + "clip_ratio/low_mean": 0.10437127808108926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.10482498188503087, + "epoch": 0.1300488334237656, + "grad_norm": 2.834176778793335, + "kl": 65.78125, + "learning_rate": 1e-06, + "loss": 0.1034, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0016270127453026362, + "clip_ratio/high_mean": 0.0005282318325043889, + "clip_ratio/low_mean": 0.08413277775980532, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08466101088561118, + "epoch": 0.13022246337493218, + "grad_norm": 2.0064496994018555, + "kl": 58.90625, + "learning_rate": 1e-06, + "loss": 0.0982, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0020655298212659545, + "clip_ratio/high_mean": 0.0005813258112539188, + "clip_ratio/low_mean": 0.05617044138489291, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.056751769327092916, + "epoch": 0.13039609332609875, + "grad_norm": 0.9604671597480774, + "kl": 49.34375, + "learning_rate": 1e-06, + "loss": 0.0931, + "step": 751 + }, + { + "clip_ratio/high_max": 0.005454467347590253, + "clip_ratio/high_mean": 0.0011836054272862384, + "clip_ratio/low_mean": 0.024697908578673378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025881513603962958, + "epoch": 0.13056972327726532, + "grad_norm": 1.7044836282730103, + "kl": 41.09375, + "learning_rate": 1e-06, + "loss": 0.0935, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2926.0, + "completions/mean_length": 769.3438110351562, + "completions/mean_terminated_length": 684.0601806640625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.1307433532284319, + "grad_norm": 2.237470865249634, + "kl": 55.21875, + "learning_rate": 1e-06, + "loss": 0.0907, + "num_tokens": 45256331.0, + "reward": 0.2611607313156128, + "reward_std": 0.28188419342041016, + "rewards/accuracy_reward/mean": 0.2611607015132904, + "rewards/accuracy_reward/std": 0.43975839018821716, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0006729250972057343, + "clip_ratio/high_mean": 0.0001255282963938953, + "clip_ratio/low_mean": 0.00010088060071211657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022640889710601186, + "epoch": 0.1309169831795985, + "grad_norm": 2.04978609085083, + "kl": 56.59375, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0008185167080227984, + "clip_ratio/high_mean": 0.00017267988732783124, + "clip_ratio/low_mean": 0.02811765088699758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028290330606978387, + "epoch": 0.13109061313076506, + "grad_norm": 1.8712111711502075, + "kl": 62.59375, + "learning_rate": 1e-06, + "loss": 0.0868, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0026534885892033344, + "clip_ratio/high_mean": 0.0004743099143524887, + "clip_ratio/low_mean": 0.050590890226885676, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.051065201638266444, + "epoch": 0.13126424308193163, + "grad_norm": 1.8145229816436768, + "kl": 68.09375, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 756 + }, + { + "clip_ratio/high_max": 0.004222834293614142, + "clip_ratio/high_mean": 0.0008103911568468902, + "clip_ratio/low_mean": 0.08598732762038708, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08679772028699517, + "epoch": 0.1314378730330982, + "grad_norm": 2.4029183387756348, + "kl": 68.3125, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 757 + }, + { + "clip_ratio/high_max": 0.004606629016052466, + "clip_ratio/high_mean": 0.0009528208283882122, + "clip_ratio/low_mean": 0.06434378866106272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06529660546220839, + "epoch": 0.13161150298426477, + "grad_norm": 1.8169597387313843, + "kl": 63.875, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 758 + }, + { + "clip_ratio/high_max": 0.005195158999413252, + "clip_ratio/high_mean": 0.0010778641026263358, + "clip_ratio/low_mean": 0.041292975074611604, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04237083822954446, + "epoch": 0.13178513293543137, + "grad_norm": 1.3364989757537842, + "kl": 57.71875, + "learning_rate": 1e-06, + "loss": 0.0798, + "step": 759 + }, + { + "clip_ratio/high_max": 0.005741680688515771, + "clip_ratio/high_mean": 0.0012905606254207669, + "clip_ratio/low_mean": 0.02960908366367221, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030899645120371133, + "epoch": 0.13195876288659794, + "grad_norm": 1.4866185188293457, + "kl": 54.625, + "learning_rate": 1e-06, + "loss": 0.0786, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2877.0, + "completions/mean_length": 1018.65185546875, + "completions/mean_terminated_length": 731.2875366210938, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.1321323928377645, + "grad_norm": 1.8105686902999878, + "kl": 63.9375, + "learning_rate": 1e-06, + "loss": 0.1269, + "num_tokens": 45787271.0, + "reward": 0.2254464328289032, + "reward_std": 0.23604397475719452, + "rewards/accuracy_reward/mean": 0.2254464328289032, + "rewards/accuracy_reward/std": 0.41834309697151184, + "step": 761 + }, + { + "clip_ratio/high_max": 0.00018667020594875794, + "clip_ratio/high_mean": 3.7687590747736976e-05, + "clip_ratio/low_mean": 0.0005341049545677379, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005717925469070906, + "epoch": 0.13230602278893108, + "grad_norm": 1.4963299036026, + "kl": 66.65625, + "learning_rate": 1e-06, + "loss": 0.1241, + "step": 762 + }, + { + "clip_ratio/high_max": 0.000260242148215184, + "clip_ratio/high_mean": 6.547723387484439e-05, + "clip_ratio/low_mean": 0.04575055650775539, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04581603336828266, + "epoch": 0.13247965274009765, + "grad_norm": 1.1714023351669312, + "kl": 72.21875, + "learning_rate": 1e-06, + "loss": 0.1217, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0018164177472499432, + "clip_ratio/high_mean": 0.000292630119020032, + "clip_ratio/low_mean": 0.09240578998560522, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09269842550838803, + "epoch": 0.13265328269126425, + "grad_norm": 2.0532360076904297, + "kl": 72.5625, + "learning_rate": 1e-06, + "loss": 0.1217, + "step": 764 + }, + { + "clip_ratio/high_max": 0.005075796198070748, + "clip_ratio/high_mean": 0.0007938718745208462, + "clip_ratio/low_mean": 0.08814978783448169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08894366023559996, + "epoch": 0.13282691264243082, + "grad_norm": 1.7194164991378784, + "kl": 68.21875, + "learning_rate": 1e-06, + "loss": 0.1201, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0073910101418732665, + "clip_ratio/high_mean": 0.0011699972183123464, + "clip_ratio/low_mean": 0.0476365410663675, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.048806536703978054, + "epoch": 0.1330005425935974, + "grad_norm": 0.8910926580429077, + "kl": 60.75, + "learning_rate": 1e-06, + "loss": 0.1181, + "step": 766 + }, + { + "clip_ratio/high_max": 0.009443512160942191, + "clip_ratio/high_mean": 0.0014932575954844651, + "clip_ratio/low_mean": 0.02849821512336348, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02999147189075302, + "epoch": 0.13317417254476396, + "grad_norm": 1.3007543087005615, + "kl": 56.828125, + "learning_rate": 1e-06, + "loss": 0.1172, + "step": 767 + }, + { + "clip_ratio/high_max": 0.009435670122911688, + "clip_ratio/high_mean": 0.0015716399930170155, + "clip_ratio/low_mean": 0.03555028114351444, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.037121920573554235, + "epoch": 0.13334780249593056, + "grad_norm": 0.9452931880950928, + "kl": 58.15625, + "learning_rate": 1e-06, + "loss": 0.1151, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1941964285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3066.0, + "completions/mean_length": 1234.078125, + "completions/mean_terminated_length": 791.14404296875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.13352143244709713, + "grad_norm": 3.654850959777832, + "kl": 77.34375, + "learning_rate": 1e-06, + "loss": 0.2567, + "num_tokens": 46402458.0, + "reward": 0.2946428656578064, + "reward_std": 0.24581019580364227, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639169216156006, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0005284457101879525, + "clip_ratio/high_mean": 0.00011822663077509787, + "clip_ratio/low_mean": 0.057520439266227186, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05763866740744561, + "epoch": 0.1336950623982637, + "grad_norm": 2.8574657440185547, + "kl": 93.40625, + "learning_rate": 1e-06, + "loss": 0.2482, + "step": 770 + }, + { + "clip_ratio/high_max": 0.000739504227112775, + "clip_ratio/high_mean": 0.00018040561928955867, + "clip_ratio/low_mean": 0.14355288445949554, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1437332914210856, + "epoch": 0.13386869234943027, + "grad_norm": 4.619664669036865, + "kl": 106.25, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0006059953102521831, + "clip_ratio/high_mean": 0.0001489139272052853, + "clip_ratio/low_mean": 0.17302960343658924, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1731785237789154, + "epoch": 0.13404232230059684, + "grad_norm": 5.126173496246338, + "kl": 108.75, + "learning_rate": 1e-06, + "loss": 0.2538, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0007724988454356208, + "clip_ratio/high_mean": 0.00018137857887268183, + "clip_ratio/low_mean": 0.14287080522626638, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.14305218355730176, + "epoch": 0.13421595225176344, + "grad_norm": 3.187932252883911, + "kl": 101.375, + "learning_rate": 1e-06, + "loss": 0.2475, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0008177857553164358, + "clip_ratio/high_mean": 0.00020139597677371057, + "clip_ratio/low_mean": 0.09355358080938458, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375497652217746, + "epoch": 0.13438958220293, + "grad_norm": 1.5500853061676025, + "kl": 88.21875, + "learning_rate": 1e-06, + "loss": 0.2426, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0023902081538835773, + "clip_ratio/high_mean": 0.0004099698114714556, + "clip_ratio/low_mean": 0.015936982177663594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016346952528692782, + "epoch": 0.13456321215409658, + "grad_norm": 3.133394718170166, + "kl": 76.96875, + "learning_rate": 1e-06, + "loss": 0.2443, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0024768876119196648, + "clip_ratio/high_mean": 0.00045920257502984896, + "clip_ratio/low_mean": 0.012660016538575292, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013119218696374446, + "epoch": 0.13473684210526315, + "grad_norm": 3.565126419067383, + "kl": 74.125, + "learning_rate": 1e-06, + "loss": 0.2455, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2410714285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3038.0, + "completions/mean_length": 1298.28125, + "completions/mean_terminated_length": 734.86474609375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.13491047205642973, + "grad_norm": 3.3048603534698486, + "kl": 126.0, + "learning_rate": 1e-06, + "loss": 0.3711, + "num_tokens": 47043848.0, + "reward": 0.2879464328289032, + "reward_std": 0.26452797651290894, + "rewards/accuracy_reward/mean": 0.2879464328289032, + "rewards/accuracy_reward/std": 0.4533121883869171, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0004137473752052756, + "clip_ratio/high_mean": 8.94434188012383e-05, + "clip_ratio/low_mean": 0.019951082998886704, + "clip_ratio/low_min": 9.677419438958168e-05, + "clip_ratio/region_mean": 0.0200405262876302, + "epoch": 0.13508410200759632, + "grad_norm": 2.188936710357666, + "kl": 140.6875, + "learning_rate": 1e-06, + "loss": 0.3634, + "step": 778 + }, + { + "clip_ratio/high_max": 0.00044048839845345356, + "clip_ratio/high_mean": 9.568350401423231e-05, + "clip_ratio/low_mean": 0.08639753190800548, + "clip_ratio/low_min": 0.0017096773954108357, + "clip_ratio/region_mean": 0.08649321692064404, + "epoch": 0.1352577319587629, + "grad_norm": 3.9684832096099854, + "kl": 151.4375, + "learning_rate": 1e-06, + "loss": 0.3638, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0007267099545060773, + "clip_ratio/high_mean": 0.0001642544301603266, + "clip_ratio/low_mean": 0.09766465961001813, + "clip_ratio/low_min": 0.011387096717953682, + "clip_ratio/region_mean": 0.09782890998758376, + "epoch": 0.13543136190992947, + "grad_norm": 3.447263240814209, + "kl": 149.3125, + "learning_rate": 1e-06, + "loss": 0.3616, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0005429339689726476, + "clip_ratio/high_mean": 0.00015278372143257002, + "clip_ratio/low_mean": 0.08436104282736778, + "clip_ratio/low_min": 0.009370191022753716, + "clip_ratio/region_mean": 0.08451382676139474, + "epoch": 0.13560499186109604, + "grad_norm": 2.1250641345977783, + "kl": 134.875, + "learning_rate": 1e-06, + "loss": 0.3559, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0006412040283976239, + "clip_ratio/high_mean": 0.00016290105247662723, + "clip_ratio/low_mean": 0.05552526586689055, + "clip_ratio/low_min": 0.007130688056349754, + "clip_ratio/region_mean": 0.055688167456537485, + "epoch": 0.1357786218122626, + "grad_norm": 2.203540325164795, + "kl": 117.3125, + "learning_rate": 1e-06, + "loss": 0.3512, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0009719731906443485, + "clip_ratio/high_mean": 0.0002233091453263114, + "clip_ratio/low_mean": 0.02666444194619544, + "clip_ratio/low_min": 0.003316079033538699, + "clip_ratio/region_mean": 0.026887751708272845, + "epoch": 0.1359522517634292, + "grad_norm": 2.894392490386963, + "kl": 112.0, + "learning_rate": 1e-06, + "loss": 0.349, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0014705288613185985, + "clip_ratio/high_mean": 0.00031802420926396735, + "clip_ratio/low_mean": 0.04936532903229818, + "clip_ratio/low_min": 0.004043998662382364, + "clip_ratio/region_mean": 0.04968335333978757, + "epoch": 0.13612588171459578, + "grad_norm": 2.1619443893432617, + "kl": 118.125, + "learning_rate": 1e-06, + "loss": 0.345, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2366071428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3007.0, + "completions/mean_length": 1317.560302734375, + "completions/mean_terminated_length": 773.7865600585938, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.13629951166576235, + "grad_norm": 6.698406219482422, + "kl": 76.25, + "learning_rate": 1e-06, + "loss": 0.3113, + "num_tokens": 47698547.0, + "reward": 0.2366071492433548, + "reward_std": 0.24626021087169647, + "rewards/accuracy_reward/mean": 0.2366071492433548, + "rewards/accuracy_reward/std": 0.4254741966724396, + "step": 785 + }, + { + "clip_ratio/high_max": 0.00022945849650568562, + "clip_ratio/high_mean": 4.106636276901554e-05, + "clip_ratio/low_mean": 0.1018415167927742, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.10188257973641157, + "epoch": 0.13647314161692892, + "grad_norm": 2.967012643814087, + "kl": 94.0, + "learning_rate": 1e-06, + "loss": 0.2929, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0003976300040449132, + "clip_ratio/high_mean": 8.659494665153034e-05, + "clip_ratio/low_mean": 0.1854246580041945, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18551124818623066, + "epoch": 0.1366467715680955, + "grad_norm": 5.255557537078857, + "kl": 108.9375, + "learning_rate": 1e-06, + "loss": 0.3009, + "step": 787 + }, + { + "clip_ratio/high_max": 0.00042971021503035445, + "clip_ratio/high_mean": 0.00010273708039676421, + "clip_ratio/low_mean": 0.19343007169663906, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.19353281147778034, + "epoch": 0.13682040151926209, + "grad_norm": 5.394523620605469, + "kl": 110.6875, + "learning_rate": 1e-06, + "loss": 0.3017, + "step": 788 + }, + { + "clip_ratio/high_max": 0.00045416820648824796, + "clip_ratio/high_mean": 0.0001025551446218742, + "clip_ratio/low_mean": 0.14288975251838565, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.14299230743199587, + "epoch": 0.13699403147042866, + "grad_norm": 2.7723424434661865, + "kl": 100.4375, + "learning_rate": 1e-06, + "loss": 0.2947, + "step": 789 + }, + { + "clip_ratio/high_max": 0.00041900723044818733, + "clip_ratio/high_mean": 8.19107850702494e-05, + "clip_ratio/low_mean": 0.09514775313436985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09522966761142015, + "epoch": 0.13716766142159523, + "grad_norm": 3.6951067447662354, + "kl": 84.0, + "learning_rate": 1e-06, + "loss": 0.2918, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0007149677048801095, + "clip_ratio/high_mean": 0.000149846292060829, + "clip_ratio/low_mean": 0.03443086822517216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0345807148842141, + "epoch": 0.1373412913727618, + "grad_norm": 5.147875785827637, + "kl": 77.15625, + "learning_rate": 1e-06, + "loss": 0.2931, + "step": 791 + }, + { + "clip_ratio/high_max": 0.000932492604079016, + "clip_ratio/high_mean": 0.000195670681705451, + "clip_ratio/low_mean": 0.04262206272687763, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.042817733134143054, + "epoch": 0.13751492132392837, + "grad_norm": 4.022861480712891, + "kl": 79.9375, + "learning_rate": 1e-06, + "loss": 0.2894, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3236607142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3060.0, + "completions/mean_length": 1558.040283203125, + "completions/mean_terminated_length": 833.5379638671875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.13768855127509497, + "grad_norm": 1.8598934412002563, + "kl": 115.5, + "learning_rate": 1e-06, + "loss": 0.2766, + "num_tokens": 48462333.0, + "reward": 0.1852678656578064, + "reward_std": 0.19267436861991882, + "rewards/accuracy_reward/mean": 0.1852678507566452, + "rewards/accuracy_reward/std": 0.38894903659820557, + "step": 793 + }, + { + "clip_ratio/high_max": 0.00011600146626733476, + "clip_ratio/high_mean": 2.152856995962793e-05, + "clip_ratio/low_mean": 7.136738281587895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.289595118389116e-05, + "epoch": 0.13786218122626154, + "grad_norm": 2.3403172492980957, + "kl": 123.25, + "learning_rate": 1e-06, + "loss": 0.2751, + "step": 794 + }, + { + "clip_ratio/high_max": 0.00013574236072599888, + "clip_ratio/high_mean": 2.9468572279256477e-05, + "clip_ratio/low_mean": 0.010359380510635674, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01038884847321242, + "epoch": 0.1380358111774281, + "grad_norm": 2.2356069087982178, + "kl": 121.125, + "learning_rate": 1e-06, + "loss": 0.2712, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0014676363698526984, + "clip_ratio/high_mean": 0.00023557679503483087, + "clip_ratio/low_mean": 0.012025003437884152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012260580318979919, + "epoch": 0.13820944112859468, + "grad_norm": 1.4892464876174927, + "kl": 109.3125, + "learning_rate": 1e-06, + "loss": 0.2655, + "step": 796 + }, + { + "clip_ratio/high_max": 0.014300603381343535, + "clip_ratio/high_mean": 0.002096177780003927, + "clip_ratio/low_mean": 0.00626198402915179, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00835816189282923, + "epoch": 0.13838307107976125, + "grad_norm": 1.637275218963623, + "kl": 96.9375, + "learning_rate": 1e-06, + "loss": 0.2604, + "step": 797 + }, + { + "clip_ratio/high_max": 0.016223127875491627, + "clip_ratio/high_mean": 0.002414258941371372, + "clip_ratio/low_mean": 0.023383931556963944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02579819077254797, + "epoch": 0.13855670103092785, + "grad_norm": 1.4654210805892944, + "kl": 94.75, + "learning_rate": 1e-06, + "loss": 0.2558, + "step": 798 + }, + { + "clip_ratio/high_max": 0.016940000725298887, + "clip_ratio/high_mean": 0.0025465849012107356, + "clip_ratio/low_mean": 0.04851913763923221, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0510657208833436, + "epoch": 0.13873033098209442, + "grad_norm": 1.2289332151412964, + "kl": 100.875, + "learning_rate": 1e-06, + "loss": 0.2517, + "step": 799 + }, + { + "clip_ratio/high_max": 0.021506152090296382, + "clip_ratio/high_mean": 0.0032153780393855413, + "clip_ratio/low_mean": 0.0690689063485479, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07228428453527158, + "epoch": 0.138903960933261, + "grad_norm": 1.58564031124115, + "kl": 102.75, + "learning_rate": 1e-06, + "loss": 0.2482, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4866071428571429, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2977.0, + "completions/mean_length": 1901.49560546875, + "completions/mean_terminated_length": 792.0608520507812, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.13907759088442756, + "grad_norm": 3.758915424346924, + "kl": 138.4375, + "learning_rate": 1e-06, + "loss": 0.3401, + "num_tokens": 49381827.0, + "reward": 0.2165178656578064, + "reward_std": 0.21905863285064697, + "rewards/accuracy_reward/mean": 0.2165178507566452, + "rewards/accuracy_reward/std": 0.41233164072036743, + "step": 801 + }, + { + "clip_ratio/high_max": 0.016221645308178267, + "clip_ratio/high_mean": 0.0023512240250056493, + "clip_ratio/low_mean": 3.170201171087683e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023829260139791586, + "epoch": 0.13925122083559413, + "grad_norm": 2.8806381225585938, + "kl": 121.5, + "learning_rate": 1e-06, + "loss": 0.3348, + "step": 802 + }, + { + "clip_ratio/high_max": 0.029960240324726328, + "clip_ratio/high_mean": 0.004402173804919585, + "clip_ratio/low_mean": 0.016006629390176386, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020408802665770054, + "epoch": 0.13942485078676073, + "grad_norm": 2.5216917991638184, + "kl": 117.6875, + "learning_rate": 1e-06, + "loss": 0.3303, + "step": 803 + }, + { + "clip_ratio/high_max": 0.02855854093650123, + "clip_ratio/high_mean": 0.004265524639777141, + "clip_ratio/low_mean": 0.09156415821053088, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09582968428730965, + "epoch": 0.1395984807379273, + "grad_norm": 3.3901422023773193, + "kl": 126.1875, + "learning_rate": 1e-06, + "loss": 0.3312, + "step": 804 + }, + { + "clip_ratio/high_max": 0.027211393280595075, + "clip_ratio/high_mean": 0.004102347544176155, + "clip_ratio/low_mean": 0.09558075363747776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0996830987278372, + "epoch": 0.13977211068909387, + "grad_norm": 3.4747204780578613, + "kl": 128.25, + "learning_rate": 1e-06, + "loss": 0.3287, + "step": 805 + }, + { + "clip_ratio/high_max": 0.025965989647374954, + "clip_ratio/high_mean": 0.003953257037210278, + "clip_ratio/low_mean": 0.0768674046266824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08082066266797483, + "epoch": 0.13994574064026044, + "grad_norm": 2.561691999435425, + "kl": 123.3125, + "learning_rate": 1e-06, + "loss": 0.3203, + "step": 806 + }, + { + "clip_ratio/high_max": 0.028138450514234137, + "clip_ratio/high_mean": 0.004267090154826292, + "clip_ratio/low_mean": 0.020701201807241887, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024968291516415775, + "epoch": 0.140119370591427, + "grad_norm": 1.5740927457809448, + "kl": 113.0, + "learning_rate": 1e-06, + "loss": 0.3143, + "step": 807 + }, + { + "clip_ratio/high_max": 0.02535862860531779, + "clip_ratio/high_mean": 0.0038807549699413357, + "clip_ratio/low_mean": 0.02614115155301988, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030021906830370426, + "epoch": 0.1402930005425936, + "grad_norm": 2.0910468101501465, + "kl": 114.8125, + "learning_rate": 1e-06, + "loss": 0.3125, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3006.0, + "completions/mean_length": 1725.4554443359375, + "completions/mean_terminated_length": 804.1353759765625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.14046663049376018, + "grad_norm": 2.258481979370117, + "kl": 101.59375, + "learning_rate": 1e-06, + "loss": 0.3352, + "num_tokens": 50221359.0, + "reward": 0.2455357313156128, + "reward_std": 0.23717302083969116, + "rewards/accuracy_reward/mean": 0.2455357164144516, + "rewards/accuracy_reward/std": 0.43088552355766296, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0002169284498449997, + "clip_ratio/high_mean": 3.764810719530942e-05, + "clip_ratio/low_mean": 0.0009202495534736954, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009578976396369399, + "epoch": 0.14064026044492675, + "grad_norm": 1.6595088243484497, + "kl": 115.875, + "learning_rate": 1e-06, + "loss": 0.3299, + "step": 810 + }, + { + "clip_ratio/high_max": 0.00033870395373014617, + "clip_ratio/high_mean": 7.560530553973877e-05, + "clip_ratio/low_mean": 0.11514052806887776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11521613108925521, + "epoch": 0.14081389039609332, + "grad_norm": 5.446586608886719, + "kl": 126.6875, + "learning_rate": 1e-06, + "loss": 0.3302, + "step": 811 + }, + { + "clip_ratio/high_max": 0.00037302924511095625, + "clip_ratio/high_mean": 8.137773033922713e-05, + "clip_ratio/low_mean": 0.10992334107868373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1100047203944996, + "epoch": 0.1409875203472599, + "grad_norm": 3.691190242767334, + "kl": 121.5625, + "learning_rate": 1e-06, + "loss": 0.3262, + "step": 812 + }, + { + "clip_ratio/high_max": 0.000664936632347235, + "clip_ratio/high_mean": 0.00013108943153383734, + "clip_ratio/low_mean": 0.0493504501064308, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0494815370766446, + "epoch": 0.1411611502984265, + "grad_norm": 1.9452085494995117, + "kl": 104.03125, + "learning_rate": 1e-06, + "loss": 0.3201, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0022810842347098514, + "clip_ratio/high_mean": 0.0003719521118910052, + "clip_ratio/low_mean": 0.026929634186672047, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027301586349494755, + "epoch": 0.14133478024959306, + "grad_norm": 2.6852047443389893, + "kl": 95.78125, + "learning_rate": 1e-06, + "loss": 0.317, + "step": 814 + }, + { + "clip_ratio/high_max": 0.002803186853270745, + "clip_ratio/high_mean": 0.0004761894952025614, + "clip_ratio/low_mean": 0.04851749795489013, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04899368720361963, + "epoch": 0.14150841020075963, + "grad_norm": 1.3267393112182617, + "kl": 100.40625, + "learning_rate": 1e-06, + "loss": 0.3124, + "step": 815 + }, + { + "clip_ratio/high_max": 0.003051950207009213, + "clip_ratio/high_mean": 0.0005383714378695004, + "clip_ratio/low_mean": 0.13059209322091192, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.13113046635407954, + "epoch": 0.1416820401519262, + "grad_norm": 4.8810625076293945, + "kl": 117.6875, + "learning_rate": 1e-06, + "loss": 0.314, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2934.0, + "completions/mean_length": 1951.91748046875, + "completions/mean_terminated_length": 759.5714111328125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.14185567010309277, + "grad_norm": 5.678103446960449, + "kl": 89.75, + "learning_rate": 1e-06, + "loss": 0.3003, + "num_tokens": 51161514.0, + "reward": 0.1741071492433548, + "reward_std": 0.225436270236969, + "rewards/accuracy_reward/mean": 0.1741071492433548, + "rewards/accuracy_reward/std": 0.37962549924850464, + "step": 817 + }, + { + "clip_ratio/high_max": 0.00012686286117968848, + "clip_ratio/high_mean": 3.3054781624741736e-05, + "clip_ratio/low_mean": 0.002505845300220244, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025389000029463205, + "epoch": 0.14202930005425937, + "grad_norm": 3.6938467025756836, + "kl": 101.5, + "learning_rate": 1e-06, + "loss": 0.2902, + "step": 818 + }, + { + "clip_ratio/high_max": 0.00032929199096543016, + "clip_ratio/high_mean": 8.803982609606464e-05, + "clip_ratio/low_mean": 0.2018748135305941, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2019628523848951, + "epoch": 0.14220293000542594, + "grad_norm": 6.159348011016846, + "kl": 127.1875, + "learning_rate": 1e-06, + "loss": 0.2981, + "step": 819 + }, + { + "clip_ratio/high_max": 0.00041023037601917167, + "clip_ratio/high_mean": 0.00010867111382140138, + "clip_ratio/low_mean": 0.24521451955661178, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2453231862746179, + "epoch": 0.1423765599565925, + "grad_norm": 7.599151134490967, + "kl": 139.125, + "learning_rate": 1e-06, + "loss": 0.3073, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0004645915096261888, + "clip_ratio/high_mean": 0.00011218153099434858, + "clip_ratio/low_mean": 0.2521938346326351, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25230601988732815, + "epoch": 0.14255018990775908, + "grad_norm": 7.119176864624023, + "kl": 133.25, + "learning_rate": 1e-06, + "loss": 0.3007, + "step": 821 + }, + { + "clip_ratio/high_max": 0.00044500757303467253, + "clip_ratio/high_mean": 0.00011843845027215139, + "clip_ratio/low_mean": 0.17444402165710926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1745624542236328, + "epoch": 0.14272381985892565, + "grad_norm": 3.4784953594207764, + "kl": 114.5, + "learning_rate": 1e-06, + "loss": 0.2846, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0003909331098839175, + "clip_ratio/high_mean": 9.231707053913851e-05, + "clip_ratio/low_mean": 0.02917414781404659, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029266465629916638, + "epoch": 0.14289744981009225, + "grad_norm": 5.367585182189941, + "kl": 91.5, + "learning_rate": 1e-06, + "loss": 0.2912, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0003303216926724417, + "clip_ratio/high_mean": 8.688794241606956e-05, + "clip_ratio/low_mean": 0.023357408994343132, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023444297316018492, + "epoch": 0.14307107976125882, + "grad_norm": 7.3322882652282715, + "kl": 82.96875, + "learning_rate": 1e-06, + "loss": 0.3018, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3025.0, + "completions/mean_length": 2064.875, + "completions/mean_terminated_length": 804.7034912109375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.1432447097124254, + "grad_norm": 5.9115071296691895, + "kl": 92.4375, + "learning_rate": 1e-06, + "loss": 0.306, + "num_tokens": 52154402.0, + "reward": 0.2008928656578064, + "reward_std": 0.19148112833499908, + "rewards/accuracy_reward/mean": 0.2008928507566452, + "rewards/accuracy_reward/std": 0.4011160135269165, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0001255907654922339, + "clip_ratio/high_mean": 2.350597844724689e-05, + "clip_ratio/low_mean": 0.0013281235069371178, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013516294548026053, + "epoch": 0.14341833966359196, + "grad_norm": 3.709016799926758, + "kl": 104.25, + "learning_rate": 1e-06, + "loss": 0.2954, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0002681526557353209, + "clip_ratio/high_mean": 5.5292078286584e-05, + "clip_ratio/low_mean": 0.1887237410992384, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18877902999520302, + "epoch": 0.14359196961475854, + "grad_norm": 7.496574401855469, + "kl": 130.5625, + "learning_rate": 1e-06, + "loss": 0.3058, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0002512324153940426, + "clip_ratio/high_mean": 5.3898011401543044e-05, + "clip_ratio/low_mean": 0.21204704232513905, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2121009435504675, + "epoch": 0.14376559956592513, + "grad_norm": 8.605510711669922, + "kl": 140.625, + "learning_rate": 1e-06, + "loss": 0.3143, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0003502083218336338, + "clip_ratio/high_mean": 7.684045681344287e-05, + "clip_ratio/low_mean": 0.19531646743416786, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.19539330154657364, + "epoch": 0.1439392295170917, + "grad_norm": 7.7824530601501465, + "kl": 134.9375, + "learning_rate": 1e-06, + "loss": 0.3084, + "step": 829 + }, + { + "clip_ratio/high_max": 0.00035015216781175695, + "clip_ratio/high_mean": 6.63939165406191e-05, + "clip_ratio/low_mean": 0.135570315644145, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.13563670963048935, + "epoch": 0.14411285946825828, + "grad_norm": 5.031496047973633, + "kl": 116.1875, + "learning_rate": 1e-06, + "loss": 0.2939, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0029564766427938594, + "clip_ratio/high_mean": 0.0005705070093426912, + "clip_ratio/low_mean": 0.009428453806322068, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009998960886150599, + "epoch": 0.14428648941942485, + "grad_norm": 4.512705326080322, + "kl": 93.8125, + "learning_rate": 1e-06, + "loss": 0.291, + "step": 831 + }, + { + "clip_ratio/high_max": 0.006206038224263466, + "clip_ratio/high_mean": 0.00115343176753413, + "clip_ratio/low_mean": 0.007983371615409851, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009136803448200226, + "epoch": 0.14446011937059142, + "grad_norm": 6.093262672424316, + "kl": 86.3125, + "learning_rate": 1e-06, + "loss": 0.2964, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5558035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3049.0, + "completions/mean_length": 2062.66748046875, + "completions/mean_terminated_length": 799.733642578125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.14463374932175802, + "grad_norm": 2.2618327140808105, + "kl": 132.9375, + "learning_rate": 1e-06, + "loss": 0.3461, + "num_tokens": 53145165.0, + "reward": 0.1852678656578064, + "reward_std": 0.21966484189033508, + "rewards/accuracy_reward/mean": 0.1852678507566452, + "rewards/accuracy_reward/std": 0.38894903659820557, + "step": 833 + }, + { + "clip_ratio/high_max": 0.00018525415543990675, + "clip_ratio/high_mean": 3.14028068260086e-05, + "clip_ratio/low_mean": 0.0006788592663724557, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007102620766090695, + "epoch": 0.14480737927292459, + "grad_norm": 1.4609254598617554, + "kl": 145.4375, + "learning_rate": 1e-06, + "loss": 0.3422, + "step": 834 + }, + { + "clip_ratio/high_max": 0.00025922157783497823, + "clip_ratio/high_mean": 4.568983740682597e-05, + "clip_ratio/low_mean": 0.09208164224401116, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0921273329295218, + "epoch": 0.14498100922409116, + "grad_norm": 4.698945045471191, + "kl": 152.3125, + "learning_rate": 1e-06, + "loss": 0.3423, + "step": 835 + }, + { + "clip_ratio/high_max": 0.00034740680530376267, + "clip_ratio/high_mean": 6.560281826750725e-05, + "clip_ratio/low_mean": 0.09124352596700191, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09130912879481912, + "epoch": 0.14515463917525773, + "grad_norm": 4.255502700805664, + "kl": 146.4375, + "learning_rate": 1e-06, + "loss": 0.3382, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0006773956120014191, + "clip_ratio/high_mean": 0.00012829599074848375, + "clip_ratio/low_mean": 0.0068486670206766576, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006976962846238166, + "epoch": 0.1453282691264243, + "grad_norm": 1.7514528036117554, + "kl": 129.5, + "learning_rate": 1e-06, + "loss": 0.3322, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0013106377937219804, + "clip_ratio/high_mean": 0.00021522672727769532, + "clip_ratio/low_mean": 0.013526803901186213, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013742031354922801, + "epoch": 0.1455018990775909, + "grad_norm": 1.9702811241149902, + "kl": 125.8125, + "learning_rate": 1e-06, + "loss": 0.3286, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0014291402103481232, + "clip_ratio/high_mean": 0.0002479023314663209, + "clip_ratio/low_mean": 0.051988479448482394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0522363813361153, + "epoch": 0.14567552902875747, + "grad_norm": 0.9578021168708801, + "kl": 136.25, + "learning_rate": 1e-06, + "loss": 0.3244, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0019179983964932035, + "clip_ratio/high_mean": 0.0003357902003244817, + "clip_ratio/low_mean": 0.07349270256236196, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07382849487476051, + "epoch": 0.14584915897992404, + "grad_norm": 1.1938884258270264, + "kl": 136.75, + "learning_rate": 1e-06, + "loss": 0.3215, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5892857142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2541.0, + "completions/mean_length": 2101.51123046875, + "completions/mean_terminated_length": 709.0706787109375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1460227889310906, + "grad_norm": 5.252035140991211, + "kl": 150.6875, + "learning_rate": 1e-06, + "loss": 0.3241, + "num_tokens": 54152602.0, + "reward": 0.1629464328289032, + "reward_std": 0.20868763327598572, + "rewards/accuracy_reward/mean": 0.1629464328289032, + "rewards/accuracy_reward/std": 0.3697296679019928, + "step": 841 + }, + { + "clip_ratio/high_max": 0.00023509889251727145, + "clip_ratio/high_mean": 4.190675502968588e-05, + "clip_ratio/low_mean": 3.273296704264794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.463972065124835e-05, + "epoch": 0.14619641888225718, + "grad_norm": 3.312365770339966, + "kl": 130.875, + "learning_rate": 1e-06, + "loss": 0.3157, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0010924797961706645, + "clip_ratio/high_mean": 0.0002439078195948241, + "clip_ratio/low_mean": 6.473634994108579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030864417476550443, + "epoch": 0.14637004883342378, + "grad_norm": 2.2687737941741943, + "kl": 111.875, + "learning_rate": 1e-06, + "loss": 0.3101, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0019106075360468822, + "clip_ratio/high_mean": 0.00040661378488948685, + "clip_ratio/low_mean": 0.0006378849971042655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010444988183735404, + "epoch": 0.14654367878459035, + "grad_norm": 2.8573033809661865, + "kl": 106.625, + "learning_rate": 1e-06, + "loss": 0.3081, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0027624194826785242, + "clip_ratio/high_mean": 0.000548504941207284, + "clip_ratio/low_mean": 0.09737362654414028, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09792213386390358, + "epoch": 0.14671730873575692, + "grad_norm": 3.7025654315948486, + "kl": 113.5625, + "learning_rate": 1e-06, + "loss": 0.3088, + "step": 845 + }, + { + "clip_ratio/high_max": 0.005159801159607014, + "clip_ratio/high_mean": 0.0009050897956512927, + "clip_ratio/low_mean": 0.11268160329200327, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11358669516630471, + "epoch": 0.1468909386869235, + "grad_norm": 4.141618728637695, + "kl": 116.75, + "learning_rate": 1e-06, + "loss": 0.3096, + "step": 846 + }, + { + "clip_ratio/high_max": 0.010196905186603544, + "clip_ratio/high_mean": 0.0016313666319547337, + "clip_ratio/low_mean": 0.09226005314849317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09389142133295536, + "epoch": 0.14706456863809006, + "grad_norm": 3.4845755100250244, + "kl": 115.75, + "learning_rate": 1e-06, + "loss": 0.3041, + "step": 847 + }, + { + "clip_ratio/high_max": 0.013069235183138517, + "clip_ratio/high_mean": 0.002065678575036145, + "clip_ratio/low_mean": 0.0634561866754666, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06552186090266332, + "epoch": 0.14723819858925666, + "grad_norm": 2.180209159851074, + "kl": 111.5625, + "learning_rate": 1e-06, + "loss": 0.2967, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4754464285714286, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2991.0, + "completions/mean_length": 1919.993408203125, + "completions/mean_terminated_length": 875.833984375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.14741182854042323, + "grad_norm": 3.406743049621582, + "kl": 73.21875, + "learning_rate": 1e-06, + "loss": 0.2119, + "num_tokens": 55080319.0, + "reward": 0.2031250149011612, + "reward_std": 0.22695398330688477, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027745723724365, + "step": 849 + }, + { + "clip_ratio/high_max": 0.00015529081065324135, + "clip_ratio/high_mean": 3.7215782640487305e-05, + "clip_ratio/low_mean": 0.000172010878486617, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020922666044498328, + "epoch": 0.1475854584915898, + "grad_norm": 3.292107582092285, + "kl": 77.09375, + "learning_rate": 1e-06, + "loss": 0.2117, + "step": 850 + }, + { + "clip_ratio/high_max": 0.00023992075421119807, + "clip_ratio/high_mean": 5.726272524952947e-05, + "clip_ratio/low_mean": 0.046176289790309966, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046233552624471486, + "epoch": 0.14775908844275637, + "grad_norm": 3.4634037017822266, + "kl": 87.53125, + "learning_rate": 1e-06, + "loss": 0.2107, + "step": 851 + }, + { + "clip_ratio/high_max": 0.00027513896202435717, + "clip_ratio/high_mean": 6.950789884285768e-05, + "clip_ratio/low_mean": 0.11348189879208803, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11355140712112188, + "epoch": 0.14793271839392294, + "grad_norm": 3.9750916957855225, + "kl": 96.1875, + "learning_rate": 1e-06, + "loss": 0.2109, + "step": 852 + }, + { + "clip_ratio/high_max": 0.00035267320890852716, + "clip_ratio/high_mean": 9.100370425585425e-05, + "clip_ratio/low_mean": 0.172421270981431, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.17251227144151926, + "epoch": 0.14810634834508954, + "grad_norm": 4.636952877044678, + "kl": 97.375, + "learning_rate": 1e-06, + "loss": 0.2091, + "step": 853 + }, + { + "clip_ratio/high_max": 0.00046502009354298934, + "clip_ratio/high_mean": 0.00012063065287293284, + "clip_ratio/low_mean": 0.1283295163884759, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1284501487389207, + "epoch": 0.1482799782962561, + "grad_norm": 2.410019874572754, + "kl": 89.65625, + "learning_rate": 1e-06, + "loss": 0.203, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0012583998814079678, + "clip_ratio/high_mean": 0.00028708873992400186, + "clip_ratio/low_mean": 0.04332887171767652, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04361596051603556, + "epoch": 0.14845360824742268, + "grad_norm": 1.791815161705017, + "kl": 75.375, + "learning_rate": 1e-06, + "loss": 0.201, + "step": 855 + }, + { + "clip_ratio/high_max": 0.006825121691235836, + "clip_ratio/high_mean": 0.001231416467874169, + "clip_ratio/low_mean": 0.03896244731731713, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04019386426080018, + "epoch": 0.14862723819858925, + "grad_norm": 2.392704486846924, + "kl": 71.03125, + "learning_rate": 1e-06, + "loss": 0.2019, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4486607142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3035.0, + "completions/mean_length": 1778.602783203125, + "completions/mean_terminated_length": 726.0809936523438, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.14880086814975582, + "grad_norm": 4.096850395202637, + "kl": 96.8125, + "learning_rate": 1e-06, + "loss": 0.316, + "num_tokens": 55936637.0, + "reward": 0.2455357313156128, + "reward_std": 0.21929533779621124, + "rewards/accuracy_reward/mean": 0.2455357164144516, + "rewards/accuracy_reward/std": 0.43088552355766296, + "step": 857 + }, + { + "clip_ratio/high_max": 0.00017866776215669233, + "clip_ratio/high_mean": 4.8445909328620473e-05, + "clip_ratio/low_mean": 0.00025178594660246745, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003002318462677067, + "epoch": 0.14897449810092242, + "grad_norm": 2.397256851196289, + "kl": 108.625, + "learning_rate": 1e-06, + "loss": 0.3092, + "step": 858 + }, + { + "clip_ratio/high_max": 0.00040521596292819595, + "clip_ratio/high_mean": 0.00010485631548817764, + "clip_ratio/low_mean": 0.15862146252766252, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1587263261899352, + "epoch": 0.149148128052089, + "grad_norm": 7.994190216064453, + "kl": 132.3125, + "learning_rate": 1e-06, + "loss": 0.318, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0005164323465578491, + "clip_ratio/high_mean": 0.00013562898402597057, + "clip_ratio/low_mean": 0.18296918272972107, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18310481496155262, + "epoch": 0.14932175800325556, + "grad_norm": 8.996588706970215, + "kl": 141.3125, + "learning_rate": 1e-06, + "loss": 0.3252, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0006346700511130621, + "clip_ratio/high_mean": 0.0001686250153625224, + "clip_ratio/low_mean": 0.18034018063917756, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18050880497321486, + "epoch": 0.14949538795442213, + "grad_norm": 8.342788696289062, + "kl": 135.25, + "learning_rate": 1e-06, + "loss": 0.3185, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0008824562992231222, + "clip_ratio/high_mean": 0.00020379819807203603, + "clip_ratio/low_mean": 0.08937643235549331, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08958023088052869, + "epoch": 0.1496690179055887, + "grad_norm": 3.00089168548584, + "kl": 116.875, + "learning_rate": 1e-06, + "loss": 0.3039, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0006571527728738147, + "clip_ratio/high_mean": 0.0001889484011599052, + "clip_ratio/low_mean": 0.028083443947252817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028272392650251277, + "epoch": 0.1498426478567553, + "grad_norm": 4.135222911834717, + "kl": 94.0, + "learning_rate": 1e-06, + "loss": 0.3052, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0010746735497377813, + "clip_ratio/high_mean": 0.00026568893872536137, + "clip_ratio/low_mean": 0.017398832620528992, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017664521721599158, + "epoch": 0.15001627780792187, + "grad_norm": 5.8383917808532715, + "kl": 83.875, + "learning_rate": 1e-06, + "loss": 0.3096, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5245535714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3025.0, + "completions/mean_length": 1942.46435546875, + "completions/mean_terminated_length": 696.262939453125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.15018990775908844, + "grad_norm": 10.818574905395508, + "kl": 114.5625, + "learning_rate": 1e-06, + "loss": 0.4242, + "num_tokens": 56869205.0, + "reward": 0.2723214328289032, + "reward_std": 0.2886577546596527, + "rewards/accuracy_reward/mean": 0.2723214328289032, + "rewards/accuracy_reward/std": 0.4456520676612854, + "step": 865 + }, + { + "clip_ratio/high_max": 0.00019906975103367586, + "clip_ratio/high_mean": 4.444785486157343e-05, + "clip_ratio/low_mean": 0.0010532964233789244, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001097744250728283, + "epoch": 0.150363537710255, + "grad_norm": 7.167519569396973, + "kl": 128.125, + "learning_rate": 1e-06, + "loss": 0.4071, + "step": 866 + }, + { + "clip_ratio/high_max": 0.00032305725108017214, + "clip_ratio/high_mean": 9.083336090043304e-05, + "clip_ratio/low_mean": 0.2271408773958683, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2272316999733448, + "epoch": 0.15053716766142158, + "grad_norm": 10.448004722595215, + "kl": 158.4375, + "learning_rate": 1e-06, + "loss": 0.4221, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0003819012144958833, + "clip_ratio/high_mean": 0.00011267153877270175, + "clip_ratio/low_mean": 0.23395942896604538, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2340720947831869, + "epoch": 0.15071079761258818, + "grad_norm": 11.921975135803223, + "kl": 173.9375, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 868 + }, + { + "clip_ratio/high_max": 0.00046272928739199415, + "clip_ratio/high_mean": 0.00012198773129057372, + "clip_ratio/low_mean": 0.2354943249374628, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2356163114309311, + "epoch": 0.15088442756375475, + "grad_norm": 11.693387031555176, + "kl": 170.9375, + "learning_rate": 1e-06, + "loss": 0.4323, + "step": 869 + }, + { + "clip_ratio/high_max": 0.00046081977779977024, + "clip_ratio/high_mean": 0.00012551588133646874, + "clip_ratio/low_mean": 0.22017306834459305, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.22029858641326427, + "epoch": 0.15105805751492132, + "grad_norm": 9.720815658569336, + "kl": 151.6875, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0004324185847508488, + "clip_ratio/high_mean": 0.00011304994632155285, + "clip_ratio/low_mean": 0.03095741411380004, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031070463082869537, + "epoch": 0.1512316874660879, + "grad_norm": 5.025526523590088, + "kl": 123.375, + "learning_rate": 1e-06, + "loss": 0.4047, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0005348296199372271, + "clip_ratio/high_mean": 0.00014661678096672404, + "clip_ratio/low_mean": 0.007732835481874645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007879451834014617, + "epoch": 0.15140531741725446, + "grad_norm": 9.378189086914062, + "kl": 115.8125, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6450892857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2980.0, + "completions/mean_length": 2236.53369140625, + "completions/mean_terminated_length": 717.9810791015625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.15157894736842106, + "grad_norm": 3.562270402908325, + "kl": 113.5, + "learning_rate": 1e-06, + "loss": 0.314, + "num_tokens": 57934196.0, + "reward": 0.1674107164144516, + "reward_std": 0.22214049100875854, + "rewards/accuracy_reward/mean": 0.1674107164144516, + "rewards/accuracy_reward/std": 0.37375950813293457, + "step": 873 + }, + { + "clip_ratio/high_max": 0.00036584509689419065, + "clip_ratio/high_mean": 5.9459053431965e-05, + "clip_ratio/low_mean": 0.0029137810810766496, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029732400934108227, + "epoch": 0.15175257731958763, + "grad_norm": 1.0336616039276123, + "kl": 131.9375, + "learning_rate": 1e-06, + "loss": 0.3085, + "step": 874 + }, + { + "clip_ratio/high_max": 0.00044667410247711814, + "clip_ratio/high_mean": 8.63881203940764e-05, + "clip_ratio/low_mean": 0.25069468189030886, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25078106578439474, + "epoch": 0.1519262072707542, + "grad_norm": 11.363845825195312, + "kl": 156.3125, + "learning_rate": 1e-06, + "loss": 0.3225, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0005712292349926429, + "clip_ratio/high_mean": 0.0001376859122501628, + "clip_ratio/low_mean": 0.2641452681273222, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2642829520627856, + "epoch": 0.15209983722192078, + "grad_norm": 12.050036430358887, + "kl": 163.0, + "learning_rate": 1e-06, + "loss": 0.3287, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0006668056030321168, + "clip_ratio/high_mean": 0.0001653730369071127, + "clip_ratio/low_mean": 0.22142764669843018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2215930139645934, + "epoch": 0.15227346717308735, + "grad_norm": 9.697548866271973, + "kl": 153.75, + "learning_rate": 1e-06, + "loss": 0.3189, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0007777407063258579, + "clip_ratio/high_mean": 0.00020208870887472585, + "clip_ratio/low_mean": 0.097353060671594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09755515051074326, + "epoch": 0.15244709712425394, + "grad_norm": 2.767470598220825, + "kl": 131.375, + "learning_rate": 1e-06, + "loss": 0.3065, + "step": 878 + }, + { + "clip_ratio/high_max": 0.04140965358601534, + "clip_ratio/high_mean": 0.006036656538981333, + "clip_ratio/low_mean": 0.05671399270067923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0627506454475224, + "epoch": 0.15262072707542051, + "grad_norm": 5.222328186035156, + "kl": 107.1875, + "learning_rate": 1e-06, + "loss": 0.3075, + "step": 879 + }, + { + "clip_ratio/high_max": 0.05749111062505108, + "clip_ratio/high_mean": 0.008346931070605024, + "clip_ratio/low_mean": 0.019935731328132533, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028282661238336004, + "epoch": 0.15279435702658709, + "grad_norm": 7.102351665496826, + "kl": 97.25, + "learning_rate": 1e-06, + "loss": 0.3107, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5089285714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3062.0, + "completions/mean_length": 1949.7724609375, + "completions/mean_terminated_length": 786.736328125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.15296798697775366, + "grad_norm": 8.057353973388672, + "kl": 96.125, + "learning_rate": 1e-06, + "loss": 0.3119, + "num_tokens": 58873966.0, + "reward": 0.1741071492433548, + "reward_std": 0.2244591861963272, + "rewards/accuracy_reward/mean": 0.1741071492433548, + "rewards/accuracy_reward/std": 0.37962549924850464, + "step": 881 + }, + { + "clip_ratio/high_max": 0.00011034975068469066, + "clip_ratio/high_mean": 2.3108349978429032e-05, + "clip_ratio/low_mean": 0.008034886610403191, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008057995117269456, + "epoch": 0.15314161692892023, + "grad_norm": 3.886857032775879, + "kl": 110.5, + "learning_rate": 1e-06, + "loss": 0.2997, + "step": 882 + }, + { + "clip_ratio/high_max": 0.00021406718224170618, + "clip_ratio/high_mean": 5.413525832409505e-05, + "clip_ratio/low_mean": 0.2143416740000248, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21439581364393234, + "epoch": 0.15331524688008683, + "grad_norm": 10.933019638061523, + "kl": 141.1875, + "learning_rate": 1e-06, + "loss": 0.3212, + "step": 883 + }, + { + "clip_ratio/high_max": 0.00023887834413471865, + "clip_ratio/high_mean": 7.264062651302083e-05, + "clip_ratio/low_mean": 0.2177857868373394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2178584225475788, + "epoch": 0.1534888768312534, + "grad_norm": 12.47949504852295, + "kl": 157.375, + "learning_rate": 1e-06, + "loss": 0.3367, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0003053253294638125, + "clip_ratio/high_mean": 7.214519200715586e-05, + "clip_ratio/low_mean": 0.21796653978526592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21803868375718594, + "epoch": 0.15366250678241997, + "grad_norm": 12.389633178710938, + "kl": 156.75, + "learning_rate": 1e-06, + "loss": 0.3358, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0017158881655632285, + "clip_ratio/high_mean": 0.00028109845879953355, + "clip_ratio/low_mean": 0.21487980522215366, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2151609044522047, + "epoch": 0.15383613673358654, + "grad_norm": 10.647978782653809, + "kl": 137.9375, + "learning_rate": 1e-06, + "loss": 0.3187, + "step": 886 + }, + { + "clip_ratio/high_max": 0.00997382725472562, + "clip_ratio/high_mean": 0.001464127509507307, + "clip_ratio/low_mean": 0.08623948926106095, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08770361728966236, + "epoch": 0.1540097666847531, + "grad_norm": 2.671112537384033, + "kl": 111.0625, + "learning_rate": 1e-06, + "loss": 0.2977, + "step": 887 + }, + { + "clip_ratio/high_max": 0.01454050781467231, + "clip_ratio/high_mean": 0.002137568667421874, + "clip_ratio/low_mean": 0.00012826038346247515, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002265829130919883, + "epoch": 0.1541833966359197, + "grad_norm": 10.069171905517578, + "kl": 84.1875, + "learning_rate": 1e-06, + "loss": 0.3157, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5178571428571428, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3062.0, + "completions/mean_length": 1985.2724609375, + "completions/mean_terminated_length": 818.0463256835938, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.15435702658708628, + "grad_norm": 13.608566284179688, + "kl": 69.25, + "learning_rate": 1e-06, + "loss": 0.3326, + "num_tokens": 59831896.0, + "reward": 0.2299107313156128, + "reward_std": 0.25083982944488525, + "rewards/accuracy_reward/mean": 0.2299107164144516, + "rewards/accuracy_reward/std": 0.42124560475349426, + "step": 889 + }, + { + "clip_ratio/high_max": 0.00018199639180238592, + "clip_ratio/high_mean": 3.663679262899677e-05, + "clip_ratio/low_mean": 3.471032061952428e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.134711336220789e-05, + "epoch": 0.15453065653825285, + "grad_norm": 13.83791446685791, + "kl": 68.8125, + "learning_rate": 1e-06, + "loss": 0.3336, + "step": 890 + }, + { + "clip_ratio/high_max": 0.00033219166562048486, + "clip_ratio/high_mean": 7.436216003497975e-05, + "clip_ratio/low_mean": 0.008575549305533059, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008649911134853028, + "epoch": 0.15470428648941942, + "grad_norm": 11.172715187072754, + "kl": 75.875, + "learning_rate": 1e-06, + "loss": 0.3163, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0004944398542647832, + "clip_ratio/high_mean": 0.00010643138398336305, + "clip_ratio/low_mean": 0.2188781681470573, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21898460248485208, + "epoch": 0.154877916440586, + "grad_norm": 3.9188079833984375, + "kl": 91.375, + "learning_rate": 1e-06, + "loss": 0.3062, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0005504580913111567, + "clip_ratio/high_mean": 0.0001277971268791589, + "clip_ratio/low_mean": 0.26006196718662977, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2601897739805281, + "epoch": 0.1550515463917526, + "grad_norm": 6.788144111633301, + "kl": 98.25, + "learning_rate": 1e-06, + "loss": 0.3113, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0005903348665015073, + "clip_ratio/high_mean": 0.0001416546588188794, + "clip_ratio/low_mean": 0.24130725720897317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.24144891370087862, + "epoch": 0.15522517634291916, + "grad_norm": 5.191519737243652, + "kl": 94.1875, + "learning_rate": 1e-06, + "loss": 0.3073, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0005820472761115525, + "clip_ratio/high_mean": 0.0001437286537111504, + "clip_ratio/low_mean": 0.08394990768283606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08409363636747003, + "epoch": 0.15539880629408573, + "grad_norm": 5.600636005401611, + "kl": 83.53125, + "learning_rate": 1e-06, + "loss": 0.3044, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0005412535756477155, + "clip_ratio/high_mean": 0.000132994029627298, + "clip_ratio/low_mean": 0.05464291200041771, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05477590672671795, + "epoch": 0.1555724362452523, + "grad_norm": 7.232648849487305, + "kl": 81.25, + "learning_rate": 1e-06, + "loss": 0.3045, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5915178571428572, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2433.0, + "completions/mean_length": 2105.051513671875, + "completions/mean_terminated_length": 704.8251342773438, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.15574606619641887, + "grad_norm": 3.882009744644165, + "kl": 91.75, + "learning_rate": 1e-06, + "loss": 0.2449, + "num_tokens": 60836439.0, + "reward": 0.1785714328289032, + "reward_std": 0.20447613298892975, + "rewards/accuracy_reward/mean": 0.1785714328289032, + "rewards/accuracy_reward/std": 0.3834211826324463, + "step": 897 + }, + { + "clip_ratio/high_max": 0.00012332035021245247, + "clip_ratio/high_mean": 3.363350992913183e-05, + "clip_ratio/low_mean": 0.015030266637040768, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015063900216773618, + "epoch": 0.15591969614758547, + "grad_norm": 1.0474673509597778, + "kl": 108.75, + "learning_rate": 1e-06, + "loss": 0.2397, + "step": 898 + }, + { + "clip_ratio/high_max": 0.00011076953978772508, + "clip_ratio/high_mean": 2.955118873160245e-05, + "clip_ratio/low_mean": 0.20200675912201405, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.20203630672767758, + "epoch": 0.15609332609875204, + "grad_norm": 9.088586807250977, + "kl": 123.625, + "learning_rate": 1e-06, + "loss": 0.249, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0003780673578148708, + "clip_ratio/high_mean": 7.620273072461714e-05, + "clip_ratio/low_mean": 0.21687012072652578, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21694633085280657, + "epoch": 0.1562669560499186, + "grad_norm": 9.344820976257324, + "kl": 126.5625, + "learning_rate": 1e-06, + "loss": 0.2506, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0003608686674851924, + "clip_ratio/high_mean": 7.9843001685731e-05, + "clip_ratio/low_mean": 0.17888206150382757, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.17896190285682678, + "epoch": 0.15644058600108518, + "grad_norm": 6.884845733642578, + "kl": 117.75, + "learning_rate": 1e-06, + "loss": 0.2425, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0008652109172544442, + "clip_ratio/high_mean": 0.00016611302180535858, + "clip_ratio/low_mean": 0.03379168309038505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03395779646234587, + "epoch": 0.15661421595225175, + "grad_norm": 1.2255645990371704, + "kl": 100.3125, + "learning_rate": 1e-06, + "loss": 0.2352, + "step": 902 + }, + { + "clip_ratio/high_max": 0.011630198830971494, + "clip_ratio/high_mean": 0.0017121230994234793, + "clip_ratio/low_mean": 0.009956433277693577, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01166855694464175, + "epoch": 0.15678784590341835, + "grad_norm": 3.3680505752563477, + "kl": 93.1875, + "learning_rate": 1e-06, + "loss": 0.2357, + "step": 903 + }, + { + "clip_ratio/high_max": 0.011259090668318095, + "clip_ratio/high_mean": 0.0016609075319138356, + "clip_ratio/low_mean": 0.021677090357115958, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023337997918133624, + "epoch": 0.15696147585458492, + "grad_norm": 2.1590869426727295, + "kl": 95.75, + "learning_rate": 1e-06, + "loss": 0.2329, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5401785714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2863.0, + "completions/mean_length": 2015.8349609375, + "completions/mean_terminated_length": 775.0971069335938, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1571351058057515, + "grad_norm": 7.960583209991455, + "kl": 116.6875, + "learning_rate": 1e-06, + "loss": 0.3772, + "num_tokens": 61805653.0, + "reward": 0.2366071492433548, + "reward_std": 0.2555035650730133, + "rewards/accuracy_reward/mean": 0.2366071492433548, + "rewards/accuracy_reward/std": 0.4254741966724396, + "step": 905 + }, + { + "clip_ratio/high_max": 0.00012708731355814962, + "clip_ratio/high_mean": 2.765070314580953e-05, + "clip_ratio/low_mean": 0.07006526505574584, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0700929150916636, + "epoch": 0.15730873575691806, + "grad_norm": 2.2192304134368896, + "kl": 142.5625, + "learning_rate": 1e-06, + "loss": 0.3648, + "step": 906 + }, + { + "clip_ratio/high_max": 0.00021208496400504373, + "clip_ratio/high_mean": 4.1516070496072643e-05, + "clip_ratio/low_mean": 0.2255147397518158, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.22555625438690186, + "epoch": 0.15748236570808463, + "grad_norm": 12.311400413513184, + "kl": 158.875, + "learning_rate": 1e-06, + "loss": 0.3773, + "step": 907 + }, + { + "clip_ratio/high_max": 0.00018545834109318093, + "clip_ratio/high_mean": 4.368719419289846e-05, + "clip_ratio/low_mean": 0.2286800742149353, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.22872376441955566, + "epoch": 0.15765599565925123, + "grad_norm": 12.384166717529297, + "kl": 159.25, + "learning_rate": 1e-06, + "loss": 0.3769, + "step": 908 + }, + { + "clip_ratio/high_max": 0.00022099344596426818, + "clip_ratio/high_mean": 5.3646714945898566e-05, + "clip_ratio/low_mean": 0.1372719295322895, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.13732558395713568, + "epoch": 0.1578296256104178, + "grad_norm": 6.36605167388916, + "kl": 145.375, + "learning_rate": 1e-06, + "loss": 0.3629, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0002334891228201741, + "clip_ratio/high_mean": 6.183929747294314e-05, + "clip_ratio/low_mean": 0.003744095837646455, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0038059352194750318, + "epoch": 0.15800325556158437, + "grad_norm": 6.614505767822266, + "kl": 122.1875, + "learning_rate": 1e-06, + "loss": 0.3672, + "step": 910 + }, + { + "clip_ratio/high_max": 0.00032186056614591507, + "clip_ratio/high_mean": 8.143467493937351e-05, + "clip_ratio/low_mean": 0.007574810081678152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007656245174075593, + "epoch": 0.15817688551275094, + "grad_norm": 8.060328483581543, + "kl": 115.75, + "learning_rate": 1e-06, + "loss": 0.3716, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0002860075865100953, + "clip_ratio/high_mean": 6.525664400669484e-05, + "clip_ratio/low_mean": 0.01937928500410635, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0194445414817892, + "epoch": 0.1583505154639175, + "grad_norm": 6.690429210662842, + "kl": 121.1875, + "learning_rate": 1e-06, + "loss": 0.3657, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5870535714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3029.0, + "completions/mean_length": 2087.696533203125, + "completions/mean_terminated_length": 688.3892211914062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1585241454150841, + "grad_norm": 1.500453233718872, + "kl": 140.5625, + "learning_rate": 1e-06, + "loss": 0.3308, + "num_tokens": 62807837.0, + "reward": 0.1696428656578064, + "reward_std": 0.18126347661018372, + "rewards/accuracy_reward/mean": 0.1696428507566452, + "rewards/accuracy_reward/std": 0.37573832273483276, + "step": 913 + }, + { + "clip_ratio/high_max": 0.00010637701734594884, + "clip_ratio/high_mean": 1.7202294884555158e-05, + "clip_ratio/low_mean": 0.07319884316530079, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07321604655589908, + "epoch": 0.15869777536625068, + "grad_norm": 8.288074493408203, + "kl": 170.125, + "learning_rate": 1e-06, + "loss": 0.3338, + "step": 914 + }, + { + "clip_ratio/high_max": 0.00011164673060193309, + "clip_ratio/high_mean": 2.7038594112127612e-05, + "clip_ratio/low_mean": 0.15749489981681108, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15752193424850702, + "epoch": 0.15887140531741725, + "grad_norm": 13.169252395629883, + "kl": 182.375, + "learning_rate": 1e-06, + "loss": 0.3436, + "step": 915 + }, + { + "clip_ratio/high_max": 0.00025164939779642737, + "clip_ratio/high_mean": 5.06560887743035e-05, + "clip_ratio/low_mean": 0.13196592358872294, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.13201657449826598, + "epoch": 0.15904503526858382, + "grad_norm": 11.579682350158691, + "kl": 175.875, + "learning_rate": 1e-06, + "loss": 0.3379, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0009999656654144928, + "clip_ratio/high_mean": 0.0001562938389838564, + "clip_ratio/low_mean": 0.023280302322746138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02343659666075837, + "epoch": 0.1592186652197504, + "grad_norm": 4.078309059143066, + "kl": 153.0, + "learning_rate": 1e-06, + "loss": 0.3231, + "step": 917 + }, + { + "clip_ratio/high_max": 0.016546025154184463, + "clip_ratio/high_mean": 0.002385221668646409, + "clip_ratio/low_mean": 0.00040862758532966836, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027938492221437627, + "epoch": 0.159392295170917, + "grad_norm": 3.589702606201172, + "kl": 122.4375, + "learning_rate": 1e-06, + "loss": 0.3217, + "step": 918 + }, + { + "clip_ratio/high_max": 0.025767189600628626, + "clip_ratio/high_mean": 0.0037047950551141184, + "clip_ratio/low_mean": 0.0005373598905862309, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004242155049723806, + "epoch": 0.15956592512208356, + "grad_norm": 5.998327732086182, + "kl": 111.0, + "learning_rate": 1e-06, + "loss": 0.3253, + "step": 919 + }, + { + "clip_ratio/high_max": 0.026498649200220825, + "clip_ratio/high_mean": 0.0038175268642817173, + "clip_ratio/low_mean": 0.0005658570084960957, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004383383849017264, + "epoch": 0.15973955507325013, + "grad_norm": 5.540132522583008, + "kl": 112.25, + "learning_rate": 1e-06, + "loss": 0.3217, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6026785714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2917.0, + "completions/mean_length": 2149.1005859375, + "completions/mean_terminated_length": 749.1966552734375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1599131850244167, + "grad_norm": 4.431583881378174, + "kl": 107.9375, + "learning_rate": 1e-06, + "loss": 0.2767, + "num_tokens": 63838858.0, + "reward": 0.1875000149011612, + "reward_std": 0.22567379474639893, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3907487094402313, + "step": 921 + }, + { + "clip_ratio/high_max": 6.693550949421478e-05, + "clip_ratio/high_mean": 1.2670173987316957e-05, + "clip_ratio/low_mean": 0.0671194423630368, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06713211312307976, + "epoch": 0.16008681497558327, + "grad_norm": 3.6399943828582764, + "kl": 130.875, + "learning_rate": 1e-06, + "loss": 0.2722, + "step": 922 + }, + { + "clip_ratio/high_max": 0.00015479441071875044, + "clip_ratio/high_mean": 2.8481904621457943e-05, + "clip_ratio/low_mean": 0.20874573290348053, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.20877421647310257, + "epoch": 0.16026044492674987, + "grad_norm": 10.984441757202148, + "kl": 143.25, + "learning_rate": 1e-06, + "loss": 0.2813, + "step": 923 + }, + { + "clip_ratio/high_max": 9.760822058524354e-05, + "clip_ratio/high_mean": 2.0003820111469395e-05, + "clip_ratio/low_mean": 0.19261416653171182, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.19263417040929198, + "epoch": 0.16043407487791644, + "grad_norm": 10.206161499023438, + "kl": 140.75, + "learning_rate": 1e-06, + "loss": 0.2776, + "step": 924 + }, + { + "clip_ratio/high_max": 0.00024990056408569217, + "clip_ratio/high_mean": 4.66027163383842e-05, + "clip_ratio/low_mean": 0.012099524845325504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012146127957748831, + "epoch": 0.16060770482908301, + "grad_norm": 1.2792290449142456, + "kl": 124.375, + "learning_rate": 1e-06, + "loss": 0.2673, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0005967910674371524, + "clip_ratio/high_mean": 0.0001559015665861807, + "clip_ratio/low_mean": 0.003816013726464007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003971915206420817, + "epoch": 0.16078133478024959, + "grad_norm": 2.380481004714966, + "kl": 116.625, + "learning_rate": 1e-06, + "loss": 0.2648, + "step": 926 + }, + { + "clip_ratio/high_max": 0.000886707566678524, + "clip_ratio/high_mean": 0.00024679061425558757, + "clip_ratio/low_mean": 0.045372905500698835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.045619698939844966, + "epoch": 0.16095496473141616, + "grad_norm": 1.2520081996917725, + "kl": 121.4375, + "learning_rate": 1e-06, + "loss": 0.2614, + "step": 927 + }, + { + "clip_ratio/high_max": 0.01455737085416331, + "clip_ratio/high_mean": 0.003907026200522523, + "clip_ratio/low_mean": 0.062229252012912184, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06613627896877006, + "epoch": 0.16112859468258275, + "grad_norm": 1.36063814163208, + "kl": 119.0625, + "learning_rate": 1e-06, + "loss": 0.258, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5111607142857143, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3016.0, + "completions/mean_length": 1925.6607666015625, + "completions/mean_terminated_length": 726.9771118164062, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.16130222463374932, + "grad_norm": 6.980772018432617, + "kl": 114.4375, + "learning_rate": 1e-06, + "loss": 0.3387, + "num_tokens": 64763274.0, + "reward": 0.2075892984867096, + "reward_std": 0.24138395488262177, + "rewards/accuracy_reward/mean": 0.2075892835855484, + "rewards/accuracy_reward/std": 0.4060344398021698, + "step": 929 + }, + { + "clip_ratio/high_max": 0.00019260190765635343, + "clip_ratio/high_mean": 4.413607376818618e-05, + "clip_ratio/low_mean": 1.581258709393296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.994866126002307e-05, + "epoch": 0.1614758545849159, + "grad_norm": 6.23195743560791, + "kl": 117.625, + "learning_rate": 1e-06, + "loss": 0.3357, + "step": 930 + }, + { + "clip_ratio/high_max": 0.00026050472479255404, + "clip_ratio/high_mean": 5.937071819062112e-05, + "clip_ratio/low_mean": 0.03085515503539682, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03091452617491086, + "epoch": 0.16164948453608247, + "grad_norm": 1.5646380186080933, + "kl": 134.6875, + "learning_rate": 1e-06, + "loss": 0.328, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0004808062549273018, + "clip_ratio/high_mean": 9.861621470008686e-05, + "clip_ratio/low_mean": 0.27080697752535343, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2709056013263762, + "epoch": 0.16182311448724904, + "grad_norm": 13.239015579223633, + "kl": 165.875, + "learning_rate": 1e-06, + "loss": 0.3505, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0006032564706401899, + "clip_ratio/high_mean": 0.00014261061528486607, + "clip_ratio/low_mean": 0.2853693403303623, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2855119528248906, + "epoch": 0.16199674443841564, + "grad_norm": 14.784218788146973, + "kl": 179.5, + "learning_rate": 1e-06, + "loss": 0.3632, + "step": 933 + }, + { + "clip_ratio/high_max": 0.001075469936040463, + "clip_ratio/high_mean": 0.00019831609210996248, + "clip_ratio/low_mean": 0.28019624296575785, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2803945569321513, + "epoch": 0.1621703743895822, + "grad_norm": 13.904273986816406, + "kl": 171.3125, + "learning_rate": 1e-06, + "loss": 0.3547, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0012982589032617398, + "clip_ratio/high_mean": 0.00025886636512950645, + "clip_ratio/low_mean": 0.20548797718947753, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.20574685028987005, + "epoch": 0.16234400434074878, + "grad_norm": 9.467459678649902, + "kl": 146.5, + "learning_rate": 1e-06, + "loss": 0.332, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0017942055601452012, + "clip_ratio/high_mean": 0.0003429347430028429, + "clip_ratio/low_mean": 0.0062899544718675315, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006632889053435065, + "epoch": 0.16251763429191535, + "grad_norm": 6.360079288482666, + "kl": 113.6875, + "learning_rate": 1e-06, + "loss": 0.3262, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3021.0, + "completions/mean_length": 2055.87060546875, + "completions/mean_terminated_length": 749.4183349609375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.16269126424308192, + "grad_norm": 10.193787574768066, + "kl": 114.375, + "learning_rate": 1e-06, + "loss": 0.3974, + "num_tokens": 65756176.0, + "reward": 0.2142857313156128, + "reward_std": 0.21259182691574097, + "rewards/accuracy_reward/mean": 0.2142857164144516, + "rewards/accuracy_reward/std": 0.41078460216522217, + "step": 937 + }, + { + "clip_ratio/high_max": 0.00011761525047404575, + "clip_ratio/high_mean": 2.2554153702003532e-05, + "clip_ratio/low_mean": 1.3739087762587587e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.629324203302531e-05, + "epoch": 0.16286489419424852, + "grad_norm": 9.7383394241333, + "kl": 115.75, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 938 + }, + { + "clip_ratio/high_max": 0.00016671485855113133, + "clip_ratio/high_mean": 2.8405810667209153e-05, + "clip_ratio/low_mean": 0.00338333747777142, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034117433351639193, + "epoch": 0.1630385241454151, + "grad_norm": 5.595941066741943, + "kl": 130.5625, + "learning_rate": 1e-06, + "loss": 0.3814, + "step": 939 + }, + { + "clip_ratio/high_max": 0.00043147909082108526, + "clip_ratio/high_mean": 7.483556942133873e-05, + "clip_ratio/low_mean": 0.22171291429549456, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.22178774420171976, + "epoch": 0.16321215409658166, + "grad_norm": 13.204809188842773, + "kl": 163.8125, + "learning_rate": 1e-06, + "loss": 0.4003, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0005606060985883232, + "clip_ratio/high_mean": 9.097210920572252e-05, + "clip_ratio/low_mean": 0.2366767730563879, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2367677390575409, + "epoch": 0.16338578404774823, + "grad_norm": 14.937211990356445, + "kl": 178.5, + "learning_rate": 1e-06, + "loss": 0.4141, + "step": 941 + }, + { + "clip_ratio/high_max": 0.00066512873127067, + "clip_ratio/high_mean": 0.00010060433078251663, + "clip_ratio/low_mean": 0.2341552283614874, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2342558316886425, + "epoch": 0.16355941399891483, + "grad_norm": 14.252479553222656, + "kl": 171.5625, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0005938055655860808, + "clip_ratio/high_mean": 9.706609193926852e-05, + "clip_ratio/low_mean": 0.14637777907773852, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.14647484989836812, + "epoch": 0.1637330439500814, + "grad_norm": 8.105178833007812, + "kl": 147.0625, + "learning_rate": 1e-06, + "loss": 0.3842, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0008423180474892433, + "clip_ratio/high_mean": 0.00013988705120482336, + "clip_ratio/low_mean": 0.008517481239323388, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008657368232888984, + "epoch": 0.16390667390124797, + "grad_norm": 8.572639465332031, + "kl": 115.5625, + "learning_rate": 1e-06, + "loss": 0.3818, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4732142857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3043.0, + "completions/mean_length": 1887.01123046875, + "completions/mean_terminated_length": 822.5296630859375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.16408030385241454, + "grad_norm": 10.422353744506836, + "kl": 93.75, + "learning_rate": 1e-06, + "loss": 0.3209, + "num_tokens": 66667021.0, + "reward": 0.2321428656578064, + "reward_std": 0.21905724704265594, + "rewards/accuracy_reward/mean": 0.2321428507566452, + "rewards/accuracy_reward/std": 0.4226716458797455, + "step": 945 + }, + { + "clip_ratio/high_max": 0.00010849650971067604, + "clip_ratio/high_mean": 2.647307758252282e-05, + "clip_ratio/low_mean": 1.806760712952382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4540684484672965e-05, + "epoch": 0.1642539338035811, + "grad_norm": 9.938231468200684, + "kl": 94.84375, + "learning_rate": 1e-06, + "loss": 0.3178, + "step": 946 + }, + { + "clip_ratio/high_max": 0.00023957950270414585, + "clip_ratio/high_mean": 4.439446558990312e-05, + "clip_ratio/low_mean": 0.0026145361644012155, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002658930486177269, + "epoch": 0.1644275637547477, + "grad_norm": 6.612903594970703, + "kl": 106.5, + "learning_rate": 1e-06, + "loss": 0.3039, + "step": 947 + }, + { + "clip_ratio/high_max": 0.00023700459769315785, + "clip_ratio/high_mean": 5.029195040151535e-05, + "clip_ratio/low_mean": 0.23280390910804272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2328542061150074, + "epoch": 0.16460119370591428, + "grad_norm": 10.239872932434082, + "kl": 129.90625, + "learning_rate": 1e-06, + "loss": 0.3136, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0002602677432150813, + "clip_ratio/high_mean": 5.894684159102326e-05, + "clip_ratio/low_mean": 0.2397623723372817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.23982131946831942, + "epoch": 0.16477482365708085, + "grad_norm": 11.328302383422852, + "kl": 139.75, + "learning_rate": 1e-06, + "loss": 0.3232, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0002460751211401657, + "clip_ratio/high_mean": 5.1699793175430386e-05, + "clip_ratio/low_mean": 0.23503495659679174, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2350866561755538, + "epoch": 0.16494845360824742, + "grad_norm": 10.441222190856934, + "kl": 134.75, + "learning_rate": 1e-06, + "loss": 0.3182, + "step": 950 + }, + { + "clip_ratio/high_max": 0.00029255266599648166, + "clip_ratio/high_mean": 6.19315328549419e-05, + "clip_ratio/low_mean": 0.17671096278354526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.17677289759740233, + "epoch": 0.165122083559414, + "grad_norm": 5.125453472137451, + "kl": 117.625, + "learning_rate": 1e-06, + "loss": 0.3051, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0003273503443779191, + "clip_ratio/high_mean": 7.093568183336174e-05, + "clip_ratio/low_mean": 0.026120682397049677, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026191617322865568, + "epoch": 0.1652957135105806, + "grad_norm": 7.151223659515381, + "kl": 95.09375, + "learning_rate": 1e-06, + "loss": 0.306, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4308035714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2903.0, + "completions/mean_length": 1742.55810546875, + "completions/mean_terminated_length": 736.3529663085938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.16546934346174716, + "grad_norm": 10.497465133666992, + "kl": 76.40625, + "learning_rate": 1e-06, + "loss": 0.2989, + "num_tokens": 67509151.0, + "reward": 0.2477678656578064, + "reward_std": 0.23506686091423035, + "rewards/accuracy_reward/mean": 0.2477678507566452, + "rewards/accuracy_reward/std": 0.4321989119052887, + "step": 953 + }, + { + "clip_ratio/high_max": 0.00022534286017616978, + "clip_ratio/high_mean": 3.7859357746583555e-05, + "clip_ratio/low_mean": 2.1199214415901224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.905857125299008e-05, + "epoch": 0.16564297341291373, + "grad_norm": 9.934066772460938, + "kl": 78.15625, + "learning_rate": 1e-06, + "loss": 0.2954, + "step": 954 + }, + { + "clip_ratio/high_max": 0.00019409676269788179, + "clip_ratio/high_mean": 4.253300915024738e-05, + "clip_ratio/low_mean": 0.00030421620613196865, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003467492215349921, + "epoch": 0.1658166033640803, + "grad_norm": 7.287125110626221, + "kl": 87.65625, + "learning_rate": 1e-06, + "loss": 0.2817, + "step": 955 + }, + { + "clip_ratio/high_max": 0.00019738081391551532, + "clip_ratio/high_mean": 4.903882182816233e-05, + "clip_ratio/low_mean": 0.24304093004775496, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.24308996790659876, + "epoch": 0.16599023331524687, + "grad_norm": 8.199006080627441, + "kl": 107.125, + "learning_rate": 1e-06, + "loss": 0.2885, + "step": 956 + }, + { + "clip_ratio/high_max": 0.00034684870934142964, + "clip_ratio/high_mean": 7.373817197731114e-05, + "clip_ratio/low_mean": 0.24608148261904716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.24615521685791464, + "epoch": 0.16616386326641347, + "grad_norm": 9.010292053222656, + "kl": 116.375, + "learning_rate": 1e-06, + "loss": 0.297, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0004251130285410909, + "clip_ratio/high_mean": 9.235024299414363e-05, + "clip_ratio/low_mean": 0.245053518563509, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2451458527448267, + "epoch": 0.16633749321758004, + "grad_norm": 8.751235961914062, + "kl": 113.875, + "learning_rate": 1e-06, + "loss": 0.2944, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0003720828808582155, + "clip_ratio/high_mean": 8.570128011342604e-05, + "clip_ratio/low_mean": 0.23733620159327984, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.23742190518987627, + "epoch": 0.1665111231687466, + "grad_norm": 7.445094585418701, + "kl": 101.4375, + "learning_rate": 1e-06, + "loss": 0.2825, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0003148090913782653, + "clip_ratio/high_mean": 7.612507499743515e-05, + "clip_ratio/low_mean": 0.002286539122906106, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023626640931979637, + "epoch": 0.16668475311991318, + "grad_norm": 7.585599899291992, + "kl": 83.09375, + "learning_rate": 1e-06, + "loss": 0.2814, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2870.0, + "completions/mean_length": 1996.3460693359375, + "completions/mean_terminated_length": 777.271484375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.16685838307107975, + "grad_norm": 4.457127571105957, + "kl": 98.25, + "learning_rate": 1e-06, + "loss": 0.2684, + "num_tokens": 68470442.0, + "reward": 0.15625, + "reward_std": 0.19389037787914276, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36349809169769287, + "step": 961 + }, + { + "clip_ratio/high_max": 4.870517568633659e-05, + "clip_ratio/high_mean": 1.1530878623489116e-05, + "clip_ratio/low_mean": 1.6947949916357175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8478828198785777e-05, + "epoch": 0.16703201302224635, + "grad_norm": 4.045124530792236, + "kl": 99.6875, + "learning_rate": 1e-06, + "loss": 0.2664, + "step": 962 + }, + { + "clip_ratio/high_max": 5.519406067833188e-05, + "clip_ratio/high_mean": 1.2269231206119002e-05, + "clip_ratio/low_mean": 0.00046836378646730736, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00048063301915135526, + "epoch": 0.16720564297341292, + "grad_norm": 1.9745092391967773, + "kl": 110.1875, + "learning_rate": 1e-06, + "loss": 0.2611, + "step": 963 + }, + { + "clip_ratio/high_max": 0.00010156120242754696, + "clip_ratio/high_mean": 2.187402935760474e-05, + "clip_ratio/low_mean": 0.21372982999309897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21375170117244124, + "epoch": 0.1673792729245795, + "grad_norm": 9.501655578613281, + "kl": 130.1875, + "learning_rate": 1e-06, + "loss": 0.2708, + "step": 964 + }, + { + "clip_ratio/high_max": 0.00014319671299745096, + "clip_ratio/high_mean": 3.603941013352596e-05, + "clip_ratio/low_mean": 0.22116861259564757, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2212046510539949, + "epoch": 0.16755290287574606, + "grad_norm": 10.32266616821289, + "kl": 137.5625, + "learning_rate": 1e-06, + "loss": 0.278, + "step": 965 + }, + { + "clip_ratio/high_max": 0.00017488148432676098, + "clip_ratio/high_mean": 4.130405636715295e-05, + "clip_ratio/low_mean": 0.21784118004143238, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21788247488439083, + "epoch": 0.16772653282691263, + "grad_norm": 9.790507316589355, + "kl": 131.8125, + "learning_rate": 1e-06, + "loss": 0.2727, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0003572340747268754, + "clip_ratio/high_mean": 8.248213782735547e-05, + "clip_ratio/low_mean": 0.10994304018095136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11002552369609475, + "epoch": 0.16790016277807923, + "grad_norm": 5.344204902648926, + "kl": 115.625, + "learning_rate": 1e-06, + "loss": 0.2587, + "step": 967 + }, + { + "clip_ratio/high_max": 0.00271320123829355, + "clip_ratio/high_mean": 0.00041651809578979737, + "clip_ratio/low_mean": 0.0007406793471886886, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001157197496468143, + "epoch": 0.1680737927292458, + "grad_norm": 3.9079058170318604, + "kl": 93.875, + "learning_rate": 1e-06, + "loss": 0.2573, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5200892857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3029.0, + "completions/mean_length": 2000.107177734375, + "completions/mean_terminated_length": 838.4744262695312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.16824742268041237, + "grad_norm": 6.443572998046875, + "kl": 73.03125, + "learning_rate": 1e-06, + "loss": 0.2359, + "num_tokens": 69433258.0, + "reward": 0.1517857164144516, + "reward_std": 0.19779597222805023, + "rewards/accuracy_reward/mean": 0.1517857164144516, + "rewards/accuracy_reward/std": 0.359214186668396, + "step": 969 + }, + { + "clip_ratio/high_max": 0.00012006993620161666, + "clip_ratio/high_mean": 2.2355856742706237e-05, + "clip_ratio/low_mean": 1.8684762949305878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.104061804355297e-05, + "epoch": 0.16842105263157894, + "grad_norm": 6.408876419067383, + "kl": 71.8125, + "learning_rate": 1e-06, + "loss": 0.2346, + "step": 970 + }, + { + "clip_ratio/high_max": 9.091695847018855e-05, + "clip_ratio/high_mean": 2.263460930862493e-05, + "clip_ratio/low_mean": 1.610271874596947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.873732862302859e-05, + "epoch": 0.16859468258274551, + "grad_norm": 5.068730354309082, + "kl": 76.375, + "learning_rate": 1e-06, + "loss": 0.2266, + "step": 971 + }, + { + "clip_ratio/high_max": 0.00019825510389637202, + "clip_ratio/high_mean": 4.5022927395166334e-05, + "clip_ratio/low_mean": 0.1800609747879207, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18010600563138723, + "epoch": 0.1687683125339121, + "grad_norm": 4.735053062438965, + "kl": 87.90625, + "learning_rate": 1e-06, + "loss": 0.2235, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0005773327470706136, + "clip_ratio/high_mean": 9.872897641116651e-05, + "clip_ratio/low_mean": 0.21200398448854685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21210271026939154, + "epoch": 0.16894194248507868, + "grad_norm": 5.869259357452393, + "kl": 91.4375, + "learning_rate": 1e-06, + "loss": 0.2271, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0010693132944652461, + "clip_ratio/high_mean": 0.00016388243568599137, + "clip_ratio/low_mean": 0.18974491767585278, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18990880390629172, + "epoch": 0.16911557243624525, + "grad_norm": 4.996444225311279, + "kl": 86.75, + "learning_rate": 1e-06, + "loss": 0.2236, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0010912273241956427, + "clip_ratio/high_mean": 0.00017403450857500502, + "clip_ratio/low_mean": 0.0499700687096265, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05014410399962799, + "epoch": 0.16928920238741182, + "grad_norm": 1.2593821287155151, + "kl": 76.59375, + "learning_rate": 1e-06, + "loss": 0.2187, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0011210100656171562, + "clip_ratio/high_mean": 0.00018442897294335125, + "clip_ratio/low_mean": 0.021285885016254724, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02147031439199054, + "epoch": 0.1694628323385784, + "grad_norm": 3.1944236755371094, + "kl": 73.46875, + "learning_rate": 1e-06, + "loss": 0.2184, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6138392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2711.0, + "completions/mean_length": 2162.2568359375, + "completions/mean_terminated_length": 716.1329345703125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.169636462289745, + "grad_norm": 6.084534645080566, + "kl": 96.53125, + "learning_rate": 1e-06, + "loss": 0.2655, + "num_tokens": 70466709.0, + "reward": 0.1696428656578064, + "reward_std": 0.1699153482913971, + "rewards/accuracy_reward/mean": 0.1696428507566452, + "rewards/accuracy_reward/std": 0.37573832273483276, + "step": 977 + }, + { + "clip_ratio/high_max": 7.904152698756661e-05, + "clip_ratio/high_mean": 1.5758804238430457e-05, + "clip_ratio/low_mean": 4.345463366917102e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.921343813497515e-05, + "epoch": 0.16981009224091156, + "grad_norm": 4.2783203125, + "kl": 104.46875, + "learning_rate": 1e-06, + "loss": 0.2557, + "step": 978 + }, + { + "clip_ratio/high_max": 0.00022280243138084188, + "clip_ratio/high_mean": 3.6412843201105716e-05, + "clip_ratio/low_mean": 0.1429557972587645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.14299221290275455, + "epoch": 0.16998372219207813, + "grad_norm": 6.9569501876831055, + "kl": 117.25, + "learning_rate": 1e-06, + "loss": 0.2592, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0002253561624456779, + "clip_ratio/high_mean": 4.105632376649737e-05, + "clip_ratio/low_mean": 0.15256792306900024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15260898042470217, + "epoch": 0.1701573521432447, + "grad_norm": 7.360963344573975, + "kl": 119.4375, + "learning_rate": 1e-06, + "loss": 0.2629, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0014589717284252401, + "clip_ratio/high_mean": 0.00023911365099138493, + "clip_ratio/low_mean": 0.1478467513807118, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.14808587171137333, + "epoch": 0.17033098209441128, + "grad_norm": 6.6269354820251465, + "kl": 111.6875, + "learning_rate": 1e-06, + "loss": 0.2582, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0015495251736865612, + "clip_ratio/high_mean": 0.00026093907763424795, + "clip_ratio/low_mean": 0.11848202999681234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11874296865426004, + "epoch": 0.17050461204557787, + "grad_norm": 4.5570807456970215, + "kl": 96.65625, + "learning_rate": 1e-06, + "loss": 0.2493, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0015935538758640178, + "clip_ratio/high_mean": 0.0002674470638339699, + "clip_ratio/low_mean": 0.00793645816611388, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008203904901165515, + "epoch": 0.17067824199674445, + "grad_norm": 4.810990810394287, + "kl": 78.65625, + "learning_rate": 1e-06, + "loss": 0.2485, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0016106147722894093, + "clip_ratio/high_mean": 0.0002810100068018073, + "clip_ratio/low_mean": 0.00047865558190096635, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0007596655777888373, + "epoch": 0.17085187194791102, + "grad_norm": 7.1348724365234375, + "kl": 71.75, + "learning_rate": 1e-06, + "loss": 0.2542, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6138392857142857, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3015.0, + "completions/mean_length": 2194.035888671875, + "completions/mean_terminated_length": 798.427734375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1710255018990776, + "grad_norm": 8.333260536193848, + "kl": 100.0, + "learning_rate": 1e-06, + "loss": 0.3071, + "num_tokens": 71515117.0, + "reward": 0.1875000149011612, + "reward_std": 0.20627696812152863, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3907487094402313, + "step": 985 + }, + { + "clip_ratio/high_max": 0.00010565961747488473, + "clip_ratio/high_mean": 2.084148070480296e-05, + "clip_ratio/low_mean": 3.544178616721183e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6283264257217525e-05, + "epoch": 0.17119913185024416, + "grad_norm": 7.001766681671143, + "kl": 104.625, + "learning_rate": 1e-06, + "loss": 0.2971, + "step": 986 + }, + { + "clip_ratio/high_max": 0.00017507059237686917, + "clip_ratio/high_mean": 3.4723048997875594e-05, + "clip_ratio/low_mean": 0.16017726808786392, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.16021199338138103, + "epoch": 0.17137276180141076, + "grad_norm": 3.890636682510376, + "kl": 115.3125, + "learning_rate": 1e-06, + "loss": 0.2901, + "step": 987 + }, + { + "clip_ratio/high_max": 0.00034652778231247794, + "clip_ratio/high_mean": 6.150089200218645e-05, + "clip_ratio/low_mean": 0.1903933109715581, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.19045480526983738, + "epoch": 0.17154639175257733, + "grad_norm": 4.350380897521973, + "kl": 111.9375, + "learning_rate": 1e-06, + "loss": 0.2905, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0055917094950928, + "clip_ratio/high_mean": 0.0008056549783077571, + "clip_ratio/low_mean": 0.18840554263442755, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.18921120278537273, + "epoch": 0.1717200217037439, + "grad_norm": 3.2020316123962402, + "kl": 99.25, + "learning_rate": 1e-06, + "loss": 0.2885, + "step": 989 + }, + { + "clip_ratio/high_max": 0.005722634058656695, + "clip_ratio/high_mean": 0.0008292219226859743, + "clip_ratio/low_mean": 0.08264759345911443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.08347681490704417, + "epoch": 0.17189365165491047, + "grad_norm": 4.136929512023926, + "kl": 85.0, + "learning_rate": 1e-06, + "loss": 0.2886, + "step": 990 + }, + { + "clip_ratio/high_max": 0.005697643208804948, + "clip_ratio/high_mean": 0.0008366483941699698, + "clip_ratio/low_mean": 0.05859705356124323, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.059433701491798274, + "epoch": 0.17206728160607704, + "grad_norm": 5.158488750457764, + "kl": 82.71875, + "learning_rate": 1e-06, + "loss": 0.2882, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0057623430984676816, + "clip_ratio/high_mean": 0.0008325353151121817, + "clip_ratio/low_mean": 0.15897481981664896, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15980734955519438, + "epoch": 0.17224091155724364, + "grad_norm": 2.436245918273926, + "kl": 90.125, + "learning_rate": 1e-06, + "loss": 0.2837, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6495535714285714, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3053.0, + "completions/mean_length": 2323.8662109375, + "completions/mean_terminated_length": 937.1974487304688, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.1724145415084102, + "grad_norm": 4.198254585266113, + "kl": 129.8125, + "learning_rate": 1e-06, + "loss": 0.2911, + "num_tokens": 72625681.0, + "reward": 0.149553582072258, + "reward_std": 0.18876877427101135, + "rewards/accuracy_reward/mean": 0.1495535671710968, + "rewards/accuracy_reward/std": 0.3570319712162018, + "step": 993 + }, + { + "clip_ratio/high_max": 0.00013605480216938304, + "clip_ratio/high_mean": 2.405033580998861e-05, + "clip_ratio/low_mean": 1.2012105401026929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.606244217735366e-05, + "epoch": 0.17258817145957678, + "grad_norm": 4.686339378356934, + "kl": 138.125, + "learning_rate": 1e-06, + "loss": 0.2921, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0001518363060313277, + "clip_ratio/high_mean": 2.9466876412698184e-05, + "clip_ratio/low_mean": 1.4395756863905262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3862633901881054e-05, + "epoch": 0.17276180141074335, + "grad_norm": 4.020852565765381, + "kl": 135.8125, + "learning_rate": 1e-06, + "loss": 0.2872, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0017639204297665856, + "clip_ratio/high_mean": 0.00026533781817761337, + "clip_ratio/low_mean": 4.6107834975828155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003114456483217509, + "epoch": 0.17293543136190992, + "grad_norm": 2.607438802719116, + "kl": 127.0625, + "learning_rate": 1e-06, + "loss": 0.2796, + "step": 996 + }, + { + "clip_ratio/high_max": 0.026950488293550734, + "clip_ratio/high_mean": 0.003863473568117115, + "clip_ratio/low_mean": 0.022118269989732653, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02598174288868904, + "epoch": 0.17310906131307652, + "grad_norm": 1.3773447275161743, + "kl": 115.8125, + "learning_rate": 1e-06, + "loss": 0.2759, + "step": 997 + }, + { + "clip_ratio/high_max": 0.02710709213715745, + "clip_ratio/high_mean": 0.003887859452788689, + "clip_ratio/low_mean": 0.10817634221166372, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.11206420324742794, + "epoch": 0.1732826912642431, + "grad_norm": 4.467653751373291, + "kl": 114.3125, + "learning_rate": 1e-06, + "loss": 0.2778, + "step": 998 + }, + { + "clip_ratio/high_max": 0.02711169681424508, + "clip_ratio/high_mean": 0.003901809108810994, + "clip_ratio/low_mean": 0.10151875065639615, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.10542056430131197, + "epoch": 0.17345632121540966, + "grad_norm": 4.3048505783081055, + "kl": 107.1875, + "learning_rate": 1e-06, + "loss": 0.2775, + "step": 999 + }, + { + "clip_ratio/high_max": 0.027477349632135883, + "clip_ratio/high_mean": 0.00397799830693657, + "clip_ratio/low_mean": 0.06908055488020182, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.07305855443701148, + "epoch": 0.17362995116657623, + "grad_norm": 3.685685873031616, + "kl": 100.46875, + "learning_rate": 1e-06, + "loss": 0.2743, + "step": 1000 + }, + { + "epoch": 0.17362995116657623, + "step": 1000, + "total_flos": 0.0, + "train_loss": 0.10515824193321169, + "train_runtime": 10653.1567, + "train_samples_per_second": 42.053, + "train_steps_per_second": 0.094 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 72625681, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}