{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9963369963369964, "eval_steps": 1000, "global_step": 204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004884004884004884, "grad_norm": 2.3243459220572165, "learning_rate": 2.3809523809523807e-08, "logits/chosen": -2.550273895263672, "logits/rejected": -2.5806894302368164, "logps/chosen": -424.7008056640625, "logps/rejected": -390.49554443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04884004884004884, "grad_norm": 2.4712584301903604, "learning_rate": 2.3809523809523806e-07, "logits/chosen": -2.4481005668640137, "logits/rejected": -2.474926471710205, "logps/chosen": -395.8595886230469, "logps/rejected": -384.5038146972656, "loss": 0.6931, "rewards/accuracies": 0.4635416567325592, "rewards/chosen": 0.00013807932555209845, "rewards/margins": 0.0004333473916631192, "rewards/rejected": -0.00029526810976676643, "step": 10 }, { "epoch": 0.09768009768009768, "grad_norm": 2.318089253747947, "learning_rate": 4.761904761904761e-07, "logits/chosen": -2.445664167404175, "logits/rejected": -2.4723546504974365, "logps/chosen": -393.4665222167969, "logps/rejected": -377.8502197265625, "loss": 0.6905, "rewards/accuracies": 0.67578125, "rewards/chosen": 0.00608012406155467, "rewards/margins": 0.005808630492538214, "rewards/rejected": 0.00027149339439347386, "step": 20 }, { "epoch": 0.14652014652014653, "grad_norm": 2.5283415680520225, "learning_rate": 4.970219740227693e-07, "logits/chosen": -2.494197368621826, "logits/rejected": -2.5383658409118652, "logps/chosen": -393.2831115722656, "logps/rejected": -383.0456237792969, "loss": 0.6754, "rewards/accuracies": 0.807812511920929, "rewards/chosen": 0.03202961012721062, "rewards/margins": 0.037289537489414215, "rewards/rejected": -0.005259926896542311, "step": 30 }, { "epoch": 0.19536019536019536, "grad_norm": 2.5364879915405267, "learning_rate": 4.868186180746791e-07, "logits/chosen": -2.5153324604034424, "logits/rejected": -2.5360398292541504, "logps/chosen": -387.1582946777344, "logps/rejected": -379.3692932128906, "loss": 0.647, "rewards/accuracies": 0.8218749761581421, "rewards/chosen": 0.0628650039434433, "rewards/margins": 0.09538714587688446, "rewards/rejected": -0.032522134482860565, "step": 40 }, { "epoch": 0.2442002442002442, "grad_norm": 2.2699251495146964, "learning_rate": 4.6965306126428705e-07, "logits/chosen": -2.539130449295044, "logits/rejected": -2.5619969367980957, "logps/chosen": -404.4756774902344, "logps/rejected": -406.6902770996094, "loss": 0.5925, "rewards/accuracies": 0.82421875, "rewards/chosen": -0.0003643702657427639, "rewards/margins": 0.2586084008216858, "rewards/rejected": -0.25897279381752014, "step": 50 }, { "epoch": 0.29304029304029305, "grad_norm": 2.4145672311111994, "learning_rate": 4.460299516441776e-07, "logits/chosen": -2.550515651702881, "logits/rejected": -2.577197551727295, "logps/chosen": -413.14947509765625, "logps/rejected": -442.47674560546875, "loss": 0.5417, "rewards/accuracies": 0.801562488079071, "rewards/chosen": -0.15052883327007294, "rewards/margins": 0.4667808413505554, "rewards/rejected": -0.6173096895217896, "step": 60 }, { "epoch": 0.3418803418803419, "grad_norm": 2.7821486929434056, "learning_rate": 4.166437820523908e-07, "logits/chosen": -2.5242340564727783, "logits/rejected": -2.5425312519073486, "logps/chosen": -446.6337890625, "logps/rejected": -487.9483337402344, "loss": 0.5011, "rewards/accuracies": 0.8023437261581421, "rewards/chosen": -0.4636126160621643, "rewards/margins": 0.6593486070632935, "rewards/rejected": -1.1229612827301025, "step": 70 }, { "epoch": 0.3907203907203907, "grad_norm": 2.754390105851781, "learning_rate": 3.8235847280454626e-07, "logits/chosen": -2.4175186157226562, "logits/rejected": -2.449018955230713, "logps/chosen": -507.10357666015625, "logps/rejected": -592.0707397460938, "loss": 0.4575, "rewards/accuracies": 0.80078125, "rewards/chosen": -1.1133525371551514, "rewards/margins": 0.9770663380622864, "rewards/rejected": -2.090418815612793, "step": 80 }, { "epoch": 0.43956043956043955, "grad_norm": 2.6724675179161568, "learning_rate": 3.4418197340879627e-07, "logits/chosen": -2.409747838973999, "logits/rejected": -2.4172959327697754, "logps/chosen": -507.91876220703125, "logps/rejected": -625.10205078125, "loss": 0.4121, "rewards/accuracies": 0.835156261920929, "rewards/chosen": -1.0922380685806274, "rewards/margins": 1.3621976375579834, "rewards/rejected": -2.4544358253479004, "step": 90 }, { "epoch": 0.4884004884004884, "grad_norm": 3.151335337005712, "learning_rate": 3.032366299846039e-07, "logits/chosen": -2.4340109825134277, "logits/rejected": -2.4465105533599854, "logps/chosen": -532.0742797851562, "logps/rejected": -683.5274658203125, "loss": 0.3892, "rewards/accuracies": 0.8335937261581421, "rewards/chosen": -1.2753849029541016, "rewards/margins": 1.6482696533203125, "rewards/rejected": -2.923654794692993, "step": 100 }, { "epoch": 0.5372405372405372, "grad_norm": 2.717335654672678, "learning_rate": 2.6072618954988863e-07, "logits/chosen": -2.4394848346710205, "logits/rejected": -2.442568778991699, "logps/chosen": -518.7210693359375, "logps/rejected": -677.293701171875, "loss": 0.3834, "rewards/accuracies": 0.827343761920929, "rewards/chosen": -1.2031551599502563, "rewards/margins": 1.6999378204345703, "rewards/rejected": -2.903092861175537, "step": 110 }, { "epoch": 0.5860805860805861, "grad_norm": 3.1603927594667005, "learning_rate": 2.1790041121336222e-07, "logits/chosen": -2.4521875381469727, "logits/rejected": -2.460845470428467, "logps/chosen": -532.1248168945312, "logps/rejected": -704.0490112304688, "loss": 0.3613, "rewards/accuracies": 0.8414062261581421, "rewards/chosen": -1.3992774486541748, "rewards/margins": 1.8053524494171143, "rewards/rejected": -3.204629898071289, "step": 120 }, { "epoch": 0.6349206349206349, "grad_norm": 2.877029930356179, "learning_rate": 1.7601832466317766e-07, "logits/chosen": -2.4438443183898926, "logits/rejected": -2.462118148803711, "logps/chosen": -540.3773193359375, "logps/rejected": -711.51416015625, "loss": 0.3581, "rewards/accuracies": 0.8453124761581421, "rewards/chosen": -1.4280272722244263, "rewards/margins": 1.8352330923080444, "rewards/rejected": -3.2632603645324707, "step": 130 }, { "epoch": 0.6837606837606838, "grad_norm": 2.8365028089984454, "learning_rate": 1.3631121611097362e-07, "logits/chosen": -2.4740078449249268, "logits/rejected": -2.487417697906494, "logps/chosen": -546.05859375, "logps/rejected": -727.7886962890625, "loss": 0.3495, "rewards/accuracies": 0.8609374761581421, "rewards/chosen": -1.4827759265899658, "rewards/margins": 1.9239017963409424, "rewards/rejected": -3.4066779613494873, "step": 140 }, { "epoch": 0.7326007326007326, "grad_norm": 3.083741716442478, "learning_rate": 9.9946429862908e-08, "logits/chosen": -2.462756633758545, "logits/rejected": -2.4654526710510254, "logps/chosen": -549.0475463867188, "logps/rejected": -722.2012329101562, "loss": 0.3454, "rewards/accuracies": 0.8492187261581421, "rewards/chosen": -1.5003674030303955, "rewards/margins": 1.9490848779678345, "rewards/rejected": -3.4494519233703613, "step": 150 }, { "epoch": 0.7814407814407814, "grad_norm": 3.086204461780561, "learning_rate": 6.799304971075381e-08, "logits/chosen": -2.4620633125305176, "logits/rejected": -2.4670565128326416, "logps/chosen": -539.4750366210938, "logps/rejected": -717.6760864257812, "loss": 0.3426, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4655063152313232, "rewards/margins": 1.9486806392669678, "rewards/rejected": -3.41418719291687, "step": 160 }, { "epoch": 0.8302808302808303, "grad_norm": 3.152043777770028, "learning_rate": 4.1390469071538175e-08, "logits/chosen": -2.4839229583740234, "logits/rejected": -2.4979355335235596, "logps/chosen": -547.6788940429688, "logps/rejected": -723.7260131835938, "loss": 0.3417, "rewards/accuracies": 0.839062511920929, "rewards/chosen": -1.532496690750122, "rewards/margins": 1.8789927959442139, "rewards/rejected": -3.411489486694336, "step": 170 }, { "epoch": 0.8791208791208791, "grad_norm": 3.34422267800285, "learning_rate": 2.0920773878248837e-08, "logits/chosen": -2.4812464714050293, "logits/rejected": -2.49059796333313, "logps/chosen": -556.6783447265625, "logps/rejected": -743.0768432617188, "loss": 0.3413, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5343992710113525, "rewards/margins": 2.0101265907287598, "rewards/rejected": -3.5445258617401123, "step": 180 }, { "epoch": 0.927960927960928, "grad_norm": 3.2655151670502574, "learning_rate": 7.185750133542168e-09, "logits/chosen": -2.473402500152588, "logits/rejected": -2.471391201019287, "logps/chosen": -554.5660400390625, "logps/rejected": -741.9495849609375, "loss": 0.3343, "rewards/accuracies": 0.8539062738418579, "rewards/chosen": -1.5347990989685059, "rewards/margins": 2.0656068325042725, "rewards/rejected": -3.6004059314727783, "step": 190 }, { "epoch": 0.9768009768009768, "grad_norm": 4.051209744645471, "learning_rate": 5.891920784984184e-10, "logits/chosen": -2.456406354904175, "logits/rejected": -2.4556210041046143, "logps/chosen": -550.1729125976562, "logps/rejected": -737.451904296875, "loss": 0.3391, "rewards/accuracies": 0.875, "rewards/chosen": -1.517017126083374, "rewards/margins": 2.044365406036377, "rewards/rejected": -3.561382293701172, "step": 200 }, { "epoch": 0.9963369963369964, "step": 204, "total_flos": 0.0, "train_loss": 0.4527332771058176, "train_runtime": 5381.1516, "train_samples_per_second": 38.96, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }