| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9910979228486647, | |
| "eval_steps": 500, | |
| "global_step": 336, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 157.78125381469727, | |
| "epoch": 0.005934718100890208, | |
| "grad_norm": 4.843128681182861, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.0954087376594543, | |
| "reward_std": 0.6969138085842133, | |
| "rewards/format_reward": 0.6562500074505806, | |
| "rewards/segmentation_reward": 0.43915872275829315, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 150.3854217529297, | |
| "epoch": 0.011869436201780416, | |
| "grad_norm": 5.3678154945373535, | |
| "kl": 0.0009851455688476562, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.010141596198082, | |
| "reward_std": 0.7214687466621399, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "rewards/segmentation_reward": 0.40597493201494217, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 158.6354217529297, | |
| "epoch": 0.017804154302670624, | |
| "grad_norm": 7.694790363311768, | |
| "kl": 0.001270294189453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.3760891556739807, | |
| "reward_std": 0.4771263860166073, | |
| "rewards/format_reward": 0.8020833432674408, | |
| "rewards/segmentation_reward": 0.5740057602524757, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 148.0208396911621, | |
| "epoch": 0.02373887240356083, | |
| "grad_norm": 11.235418319702148, | |
| "kl": 0.00232696533203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.367339700460434, | |
| "reward_std": 0.42488569766283035, | |
| "rewards/format_reward": 0.8437500149011612, | |
| "rewards/segmentation_reward": 0.523589625954628, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 150.77083587646484, | |
| "epoch": 0.02967359050445104, | |
| "grad_norm": 6.50709342956543, | |
| "kl": 0.00482177734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.173013836145401, | |
| "reward_std": 0.744086429476738, | |
| "rewards/format_reward": 0.6979166716337204, | |
| "rewards/segmentation_reward": 0.4750971421599388, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 142.7291717529297, | |
| "epoch": 0.03560830860534125, | |
| "grad_norm": 4.435483932495117, | |
| "kl": 0.00745391845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.5586660504341125, | |
| "reward_std": 0.1734358905814588, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6003326624631882, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 138.25000762939453, | |
| "epoch": 0.04154302670623145, | |
| "grad_norm": 7.039760112762451, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.5964379608631134, | |
| "reward_std": 0.23111657053232193, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.648521289229393, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 144.59375381469727, | |
| "epoch": 0.04747774480712166, | |
| "grad_norm": 6.002971649169922, | |
| "kl": 0.010711669921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.500212699174881, | |
| "reward_std": 0.26775410771369934, | |
| "rewards/format_reward": 0.927083358168602, | |
| "rewards/segmentation_reward": 0.573129341006279, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 136.31250381469727, | |
| "epoch": 0.05341246290801187, | |
| "grad_norm": 11.051139831542969, | |
| "kl": 0.016204833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.4772345423698425, | |
| "reward_std": 0.28671396523714066, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "rewards/segmentation_reward": 0.5814011096954346, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 138.37500381469727, | |
| "epoch": 0.05934718100890208, | |
| "grad_norm": 5.32958459854126, | |
| "kl": 0.0171356201171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.6374868154525757, | |
| "reward_std": 0.2423457931727171, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6687368303537369, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 134.0312557220459, | |
| "epoch": 0.06528189910979229, | |
| "grad_norm": 12.082164764404297, | |
| "kl": 0.0192108154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.652137815952301, | |
| "reward_std": 0.18039543880149722, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6938043981790543, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 134.58333587646484, | |
| "epoch": 0.0712166172106825, | |
| "grad_norm": 6.0959930419921875, | |
| "kl": 0.0180816650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5255734622478485, | |
| "reward_std": 0.21420218795537949, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.5568235069513321, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 132.81250762939453, | |
| "epoch": 0.0771513353115727, | |
| "grad_norm": 8.166247367858887, | |
| "kl": 0.018310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.576871931552887, | |
| "reward_std": 0.16248912550508976, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6081219017505646, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 135.7604217529297, | |
| "epoch": 0.0830860534124629, | |
| "grad_norm": 6.298133850097656, | |
| "kl": 0.02197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7197113037109375, | |
| "reward_std": 0.07741504721343517, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7301279306411743, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 131.15625381469727, | |
| "epoch": 0.08902077151335312, | |
| "grad_norm": 15.750993728637695, | |
| "kl": 0.023529052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6381132900714874, | |
| "reward_std": 0.24310634471476078, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6797799617052078, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 134.31250762939453, | |
| "epoch": 0.09495548961424333, | |
| "grad_norm": 14.33760929107666, | |
| "kl": 0.03131103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.707416594028473, | |
| "reward_std": 0.20465393085032701, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7386665642261505, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 128.7812557220459, | |
| "epoch": 0.10089020771513353, | |
| "grad_norm": 9.760759353637695, | |
| "kl": 0.02435302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.6728956699371338, | |
| "reward_std": 0.12627490423619747, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6833122968673706, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 128.16666984558105, | |
| "epoch": 0.10682492581602374, | |
| "grad_norm": 29.32834815979004, | |
| "kl": 0.02447509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.679076761007309, | |
| "reward_std": 0.16491653956472874, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6999100893735886, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 128.4479217529297, | |
| "epoch": 0.11275964391691394, | |
| "grad_norm": 8.054636001586914, | |
| "kl": 0.026824951171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6828641295433044, | |
| "reward_std": 0.05810322519391775, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6828641593456268, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 131.4687557220459, | |
| "epoch": 0.11869436201780416, | |
| "grad_norm": 6.730808258056641, | |
| "kl": 0.026153564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.654236376285553, | |
| "reward_std": 0.15527622308582067, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6750697493553162, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 128.3229217529297, | |
| "epoch": 0.12462908011869436, | |
| "grad_norm": 7.039961814880371, | |
| "kl": 0.0263671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.70102459192276, | |
| "reward_std": 0.07563944300636649, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7114411741495132, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 128.82292366027832, | |
| "epoch": 0.13056379821958458, | |
| "grad_norm": 25.049779891967773, | |
| "kl": 0.033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6903910338878632, | |
| "reward_std": 0.11956312041729689, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7008076012134552, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 130.3541717529297, | |
| "epoch": 0.13649851632047477, | |
| "grad_norm": 22.480205535888672, | |
| "kl": 0.028472900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7005047500133514, | |
| "reward_std": 0.15591828245669603, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7317546755075455, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 125.06250190734863, | |
| "epoch": 0.142433234421365, | |
| "grad_norm": 7.072060585021973, | |
| "kl": 0.044342041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7708145081996918, | |
| "reward_std": 0.08357710530981421, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7812311798334122, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 121.11458587646484, | |
| "epoch": 0.14836795252225518, | |
| "grad_norm": 8.955461502075195, | |
| "kl": 0.027099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6748111546039581, | |
| "reward_std": 0.12412907555699348, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6852278560400009, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 120.08333396911621, | |
| "epoch": 0.1543026706231454, | |
| "grad_norm": 6.5587687492370605, | |
| "kl": 0.030548095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6505275666713715, | |
| "reward_std": 0.19929413869976997, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.6817775219678879, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 117.77083778381348, | |
| "epoch": 0.16023738872403562, | |
| "grad_norm": 7.269732475280762, | |
| "kl": 0.0325927734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6844450235366821, | |
| "reward_std": 0.10336442582774907, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6948617249727249, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 120.55208587646484, | |
| "epoch": 0.1661721068249258, | |
| "grad_norm": 5.853372097015381, | |
| "kl": 0.026458740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6804953813552856, | |
| "reward_std": 0.12254019640386105, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7013287395238876, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 121.02083587646484, | |
| "epoch": 0.17210682492581603, | |
| "grad_norm": 31.43914031982422, | |
| "kl": 0.0313720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.707067847251892, | |
| "reward_std": 0.12043083645403385, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7070677876472473, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 122.77083587646484, | |
| "epoch": 0.17804154302670624, | |
| "grad_norm": 10.165179252624512, | |
| "kl": 0.027313232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7467810809612274, | |
| "reward_std": 0.07829774497076869, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7571977376937866, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 121.52083587646484, | |
| "epoch": 0.18397626112759644, | |
| "grad_norm": 18.9634952545166, | |
| "kl": 0.02899169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7623717486858368, | |
| "reward_std": 0.06413916405290365, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7623717188835144, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 119.40625190734863, | |
| "epoch": 0.18991097922848665, | |
| "grad_norm": 9.280572891235352, | |
| "kl": 0.111724853515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0045, | |
| "reward": 1.6756855249404907, | |
| "reward_std": 0.17220945027656853, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7173521369695663, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 120.37500190734863, | |
| "epoch": 0.19584569732937684, | |
| "grad_norm": 20.99109649658203, | |
| "kl": 0.032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6820135414600372, | |
| "reward_std": 0.09892075881361961, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7028468549251556, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 119.59375381469727, | |
| "epoch": 0.20178041543026706, | |
| "grad_norm": 11.153885841369629, | |
| "kl": 0.0384521484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7420443892478943, | |
| "reward_std": 0.05706456396728754, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7420443743467331, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 117.82291984558105, | |
| "epoch": 0.20771513353115728, | |
| "grad_norm": 7.118908882141113, | |
| "kl": 0.03765869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7531715631484985, | |
| "reward_std": 0.10411902144551277, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7635882049798965, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 120.62500381469727, | |
| "epoch": 0.21364985163204747, | |
| "grad_norm": 11.814937591552734, | |
| "kl": 0.03887939453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7418705523014069, | |
| "reward_std": 0.07266789069399238, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7418705374002457, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 119.91666984558105, | |
| "epoch": 0.2195845697329377, | |
| "grad_norm": 6.670704364776611, | |
| "kl": 0.03497314453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.758776307106018, | |
| "reward_std": 0.083824397996068, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7691929787397385, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 121.03125381469727, | |
| "epoch": 0.22551928783382788, | |
| "grad_norm": 7.1339545249938965, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.780372679233551, | |
| "reward_std": 0.05653517507016659, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7803726345300674, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 123.71875190734863, | |
| "epoch": 0.2314540059347181, | |
| "grad_norm": 20.304597854614258, | |
| "kl": 0.036163330078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7353099882602692, | |
| "reward_std": 0.09488376975059509, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7457266598939896, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 126.91666793823242, | |
| "epoch": 0.23738872403560832, | |
| "grad_norm": 7.093606472015381, | |
| "kl": 0.03582763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.730019450187683, | |
| "reward_std": 0.061210392508655787, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7300194352865219, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 127.8125057220459, | |
| "epoch": 0.2433234421364985, | |
| "grad_norm": 11.748034477233887, | |
| "kl": 0.038818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.693460375070572, | |
| "reward_std": 0.10717929899692535, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7142936736345291, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 123.32291984558105, | |
| "epoch": 0.24925816023738873, | |
| "grad_norm": 10.225717544555664, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.7418551743030548, | |
| "reward_std": 0.15680904872715473, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7731050848960876, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 128.7083339691162, | |
| "epoch": 0.2551928783382789, | |
| "grad_norm": 7.125417232513428, | |
| "kl": 0.0401611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.6099365949630737, | |
| "reward_std": 0.23078683763742447, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6516032218933105, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 127.89583778381348, | |
| "epoch": 0.26112759643916916, | |
| "grad_norm": 14.912107467651367, | |
| "kl": 0.0399169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.700283408164978, | |
| "reward_std": 0.1383817931637168, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7315333336591721, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 134.50000381469727, | |
| "epoch": 0.26706231454005935, | |
| "grad_norm": 7.416093349456787, | |
| "kl": 0.037109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7258837223052979, | |
| "reward_std": 0.10230511240661144, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7363004088401794, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 130.80208587646484, | |
| "epoch": 0.27299703264094954, | |
| "grad_norm": 195.13722229003906, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8153444528579712, | |
| "reward_std": 0.07033979892730713, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8153444081544876, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 129.22916793823242, | |
| "epoch": 0.2789317507418398, | |
| "grad_norm": 5.549770832061768, | |
| "kl": 0.04315185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.7382583618164062, | |
| "reward_std": 0.044012173311784863, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7382583022117615, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 135.3541717529297, | |
| "epoch": 0.28486646884273, | |
| "grad_norm": 13.713714599609375, | |
| "kl": 0.0467529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.740307867527008, | |
| "reward_std": 0.12180997617542744, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7611411660909653, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 131.77083587646484, | |
| "epoch": 0.29080118694362017, | |
| "grad_norm": 17.87116241455078, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7088869214057922, | |
| "reward_std": 0.07198232505470514, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7088869214057922, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 135.3333396911621, | |
| "epoch": 0.29673590504451036, | |
| "grad_norm": 14.480645179748535, | |
| "kl": 0.04559326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7133915424346924, | |
| "reward_std": 0.15073410887271166, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7550581842660904, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 137.1041717529297, | |
| "epoch": 0.3026706231454006, | |
| "grad_norm": 5.64309024810791, | |
| "kl": 0.03741455078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.6373025476932526, | |
| "reward_std": 0.13259334303438663, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6581359207630157, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 135.1979217529297, | |
| "epoch": 0.3086053412462908, | |
| "grad_norm": 5.212345123291016, | |
| "kl": 0.037841796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7314472496509552, | |
| "reward_std": 0.04300686717033386, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7314473092556, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 134.28125381469727, | |
| "epoch": 0.314540059347181, | |
| "grad_norm": 6.159574508666992, | |
| "kl": 0.04058837890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7339943647384644, | |
| "reward_std": 0.07974916975945234, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7444109320640564, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 139.8958396911621, | |
| "epoch": 0.32047477744807124, | |
| "grad_norm": 19.429752349853516, | |
| "kl": 0.03790283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7864000797271729, | |
| "reward_std": 0.08986240020021796, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8072333931922913, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 140.21875762939453, | |
| "epoch": 0.3264094955489614, | |
| "grad_norm": 8.32386302947998, | |
| "kl": 0.12933349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0052, | |
| "reward": 1.7257477641105652, | |
| "reward_std": 0.11480520572513342, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7465810775756836, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 140.37500381469727, | |
| "epoch": 0.3323442136498516, | |
| "grad_norm": 4.434051990509033, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7016322612762451, | |
| "reward_std": 0.14971541427075863, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7328822016716003, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 138.32291793823242, | |
| "epoch": 0.33827893175074186, | |
| "grad_norm": 21.184770584106445, | |
| "kl": 0.03936767578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7572259902954102, | |
| "reward_std": 0.03327028127387166, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7572259604930878, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 144.4791717529297, | |
| "epoch": 0.34421364985163205, | |
| "grad_norm": 12.050070762634277, | |
| "kl": 0.04034423828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7147690653800964, | |
| "reward_std": 0.09481704700738192, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7147690504789352, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 140.72916793823242, | |
| "epoch": 0.35014836795252224, | |
| "grad_norm": 17.42802619934082, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.736547976732254, | |
| "reward_std": 0.10654295142740011, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.746964618563652, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 140.9270896911621, | |
| "epoch": 0.3560830860534125, | |
| "grad_norm": 10.749812126159668, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.753279983997345, | |
| "reward_std": 0.040626129135489464, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7532799541950226, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 140.37500381469727, | |
| "epoch": 0.3620178041543027, | |
| "grad_norm": 7.617424488067627, | |
| "kl": 0.035858154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6448023915290833, | |
| "reward_std": 0.1214896326418966, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.6656356900930405, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 138.68750381469727, | |
| "epoch": 0.36795252225519287, | |
| "grad_norm": 15.33764362335205, | |
| "kl": 0.043731689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7142232954502106, | |
| "reward_std": 0.1497452650219202, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7454732358455658, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 143.78125762939453, | |
| "epoch": 0.37388724035608306, | |
| "grad_norm": 26.1903133392334, | |
| "kl": 0.031951904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.717173308134079, | |
| "reward_std": 0.1322586655151099, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7484232485294342, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 139.3645896911621, | |
| "epoch": 0.3798219584569733, | |
| "grad_norm": 11.163153648376465, | |
| "kl": 0.0316162109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7420830428600311, | |
| "reward_std": 0.16791360033676028, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7837497144937515, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 138.12500381469727, | |
| "epoch": 0.3857566765578635, | |
| "grad_norm": 8.011311531066895, | |
| "kl": 0.03466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6190190613269806, | |
| "reward_std": 0.21082479134202003, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.6502691060304642, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 144.75000762939453, | |
| "epoch": 0.3916913946587537, | |
| "grad_norm": 11.111846923828125, | |
| "kl": 0.03204345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.755936175584793, | |
| "reward_std": 0.10981305036693811, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7663528323173523, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 139.93750762939453, | |
| "epoch": 0.39762611275964393, | |
| "grad_norm": 9.128460884094238, | |
| "kl": 0.033660888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7998457551002502, | |
| "reward_std": 0.06874012341722846, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7998457551002502, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 142.1979217529297, | |
| "epoch": 0.4035608308605341, | |
| "grad_norm": 12.227910041809082, | |
| "kl": 0.032440185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7355614304542542, | |
| "reward_std": 0.04059491120278835, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7355614304542542, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 147.52084350585938, | |
| "epoch": 0.4094955489614243, | |
| "grad_norm": 4.963237762451172, | |
| "kl": 0.03948974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.6883772611618042, | |
| "reward_std": 0.12697911448776722, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7092105895280838, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 146.0104217529297, | |
| "epoch": 0.41543026706231456, | |
| "grad_norm": 5.463217258453369, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.655370056629181, | |
| "reward_std": 0.1917936820536852, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.6970367729663849, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 144.03125381469727, | |
| "epoch": 0.42136498516320475, | |
| "grad_norm": 11.424163818359375, | |
| "kl": 0.0374755859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.662711262702942, | |
| "reward_std": 0.16645172238349915, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.6939611881971359, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 147.8333396911621, | |
| "epoch": 0.42729970326409494, | |
| "grad_norm": 19.122568130493164, | |
| "kl": 0.030853271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7026071846485138, | |
| "reward_std": 0.12206890597008169, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.723440483212471, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 146.35417556762695, | |
| "epoch": 0.4332344213649852, | |
| "grad_norm": 5.500903129577637, | |
| "kl": 0.03125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6472920775413513, | |
| "reward_std": 0.20085742231458426, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.6889587044715881, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 145.47916793823242, | |
| "epoch": 0.4391691394658754, | |
| "grad_norm": 8.684319496154785, | |
| "kl": 0.035247802734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7125734388828278, | |
| "reward_std": 0.11136134760454297, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7334067076444626, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 147.15625381469727, | |
| "epoch": 0.44510385756676557, | |
| "grad_norm": 7.0105109214782715, | |
| "kl": 0.02880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.690079689025879, | |
| "reward_std": 0.06854211632162333, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6900796890258789, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 144.68750762939453, | |
| "epoch": 0.45103857566765576, | |
| "grad_norm": 4.295221328735352, | |
| "kl": 0.032958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7480990290641785, | |
| "reward_std": 0.10679568164050579, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7585156410932541, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 137.6145896911621, | |
| "epoch": 0.456973293768546, | |
| "grad_norm": 9.906067848205566, | |
| "kl": 0.0330810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7196555435657501, | |
| "reward_std": 0.15833959798328578, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7613222450017929, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 140.1041717529297, | |
| "epoch": 0.4629080118694362, | |
| "grad_norm": 7.921298503875732, | |
| "kl": 0.03240966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7260190546512604, | |
| "reward_std": 0.109945148229599, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7468523383140564, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 137.4166717529297, | |
| "epoch": 0.4688427299703264, | |
| "grad_norm": 13.661524772644043, | |
| "kl": 0.02947998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.721655547618866, | |
| "reward_std": 0.15363138541579247, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7529055327177048, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 133.37500762939453, | |
| "epoch": 0.47477744807121663, | |
| "grad_norm": 7.67321252822876, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6211079955101013, | |
| "reward_std": 0.2416740320622921, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.6731913536787033, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 133.2291717529297, | |
| "epoch": 0.4807121661721068, | |
| "grad_norm": 6.358465194702148, | |
| "kl": 0.0377197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.6948546469211578, | |
| "reward_std": 0.15342898294329643, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7261045873165131, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 137.3958396911621, | |
| "epoch": 0.486646884272997, | |
| "grad_norm": 5.652665138244629, | |
| "kl": 0.03717041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7105199992656708, | |
| "reward_std": 0.05939330440014601, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7105200439691544, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 128.9687557220459, | |
| "epoch": 0.49258160237388726, | |
| "grad_norm": 9.61016845703125, | |
| "kl": 0.0361328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6815986335277557, | |
| "reward_std": 0.07865951140411198, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.692015215754509, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 132.33333778381348, | |
| "epoch": 0.49851632047477745, | |
| "grad_norm": 13.813359260559082, | |
| "kl": 0.041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7599249482154846, | |
| "reward_std": 0.1436138590797782, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7911749482154846, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 125.53125381469727, | |
| "epoch": 0.5044510385756676, | |
| "grad_norm": 8.654479026794434, | |
| "kl": 0.0391845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.730294018983841, | |
| "reward_std": 0.08125040959566832, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7407106310129166, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 125.79166984558105, | |
| "epoch": 0.5103857566765578, | |
| "grad_norm": 12.567873001098633, | |
| "kl": 0.0428466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.7137242257595062, | |
| "reward_std": 0.12538791447877884, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7345575541257858, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 119.34375190734863, | |
| "epoch": 0.516320474777448, | |
| "grad_norm": 4.4666361808776855, | |
| "kl": 0.04974365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.6612667441368103, | |
| "reward_std": 0.16449622996151447, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6925167888402939, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 120.00000381469727, | |
| "epoch": 0.5222551928783383, | |
| "grad_norm": 8.294538497924805, | |
| "kl": 0.0579833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.6651489436626434, | |
| "reward_std": 0.15905625000596046, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7068156003952026, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 114.86458587646484, | |
| "epoch": 0.5281899109792285, | |
| "grad_norm": 4.889521598815918, | |
| "kl": 0.056396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7740101218223572, | |
| "reward_std": 0.08986124489456415, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7844266593456268, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 113.12500190734863, | |
| "epoch": 0.5341246290801187, | |
| "grad_norm": 13.152044296264648, | |
| "kl": 0.06005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7618820667266846, | |
| "reward_std": 0.1168306190520525, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7722987830638885, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 109.41666984558105, | |
| "epoch": 0.5400593471810089, | |
| "grad_norm": 41.23225402832031, | |
| "kl": 0.062255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7959917783737183, | |
| "reward_std": 0.05520722921937704, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7959917634725571, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 112.48958587646484, | |
| "epoch": 0.5459940652818991, | |
| "grad_norm": 24.223176956176758, | |
| "kl": 0.0667724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7793012261390686, | |
| "reward_std": 0.07624120265245438, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.789717823266983, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 108.58333587646484, | |
| "epoch": 0.5519287833827893, | |
| "grad_norm": 6.036355972290039, | |
| "kl": 0.10546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.6717748641967773, | |
| "reward_std": 0.1516607478260994, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.6926082074642181, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 104.37500381469727, | |
| "epoch": 0.5578635014836796, | |
| "grad_norm": 6.875241279602051, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7788923680782318, | |
| "reward_std": 0.07802953757345676, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7788923531770706, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 106.38541984558105, | |
| "epoch": 0.5637982195845698, | |
| "grad_norm": 9.138381958007812, | |
| "kl": 0.080322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7983123362064362, | |
| "reward_std": 0.0496332747861743, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7983123362064362, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 106.48958778381348, | |
| "epoch": 0.56973293768546, | |
| "grad_norm": 19.429649353027344, | |
| "kl": 0.084716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7282686233520508, | |
| "reward_std": 0.1304482314735651, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.749101996421814, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 103.33333396911621, | |
| "epoch": 0.5756676557863502, | |
| "grad_norm": 6.47030782699585, | |
| "kl": 0.086669921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7975687980651855, | |
| "reward_std": 0.046134506817907095, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7975687384605408, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 106.27083587646484, | |
| "epoch": 0.5816023738872403, | |
| "grad_norm": 19.016536712646484, | |
| "kl": 0.09228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7813318371772766, | |
| "reward_std": 0.08001636108383536, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.791748434305191, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 104.04166984558105, | |
| "epoch": 0.5875370919881305, | |
| "grad_norm": 7.989252090454102, | |
| "kl": 0.087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8176788091659546, | |
| "reward_std": 0.04656107863411307, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.817678764462471, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 104.83333587646484, | |
| "epoch": 0.5934718100890207, | |
| "grad_norm": 12.417603492736816, | |
| "kl": 0.093994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.756578117609024, | |
| "reward_std": 0.09739597979933023, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7669947296380997, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 103.44791793823242, | |
| "epoch": 0.599406528189911, | |
| "grad_norm": 8.88759708404541, | |
| "kl": 0.1048583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.710509866476059, | |
| "reward_std": 0.16302508860826492, | |
| "rewards/format_reward": 0.9583333730697632, | |
| "rewards/segmentation_reward": 0.7521764636039734, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 104.97916984558105, | |
| "epoch": 0.6053412462908012, | |
| "grad_norm": 6.247071266174316, | |
| "kl": 0.0887451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.764405071735382, | |
| "reward_std": 0.09363555815070868, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7748217582702637, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 103.29166793823242, | |
| "epoch": 0.6112759643916914, | |
| "grad_norm": 8.60831069946289, | |
| "kl": 0.0771484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7240653932094574, | |
| "reward_std": 0.07296892208978534, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7240653038024902, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 104.32291984558105, | |
| "epoch": 0.6172106824925816, | |
| "grad_norm": 41.23006820678711, | |
| "kl": 0.089599609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7711005508899689, | |
| "reward_std": 0.10164707154035568, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7815171927213669, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 102.12500190734863, | |
| "epoch": 0.6231454005934718, | |
| "grad_norm": 10.650015830993652, | |
| "kl": 0.0810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7078483700752258, | |
| "reward_std": 0.047545977868139744, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.707848384976387, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 103.43750381469727, | |
| "epoch": 0.629080118694362, | |
| "grad_norm": 5.714875221252441, | |
| "kl": 0.088623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7607176005840302, | |
| "reward_std": 0.11030509509146214, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7815508991479874, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 102.39583587646484, | |
| "epoch": 0.6350148367952523, | |
| "grad_norm": 4.9711079597473145, | |
| "kl": 0.0848388671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7914873659610748, | |
| "reward_std": 0.07838396297302097, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8019039928913116, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 105.53125190734863, | |
| "epoch": 0.6409495548961425, | |
| "grad_norm": 34.618621826171875, | |
| "kl": 0.094970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7531647086143494, | |
| "reward_std": 0.133857280947268, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7739980816841125, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 106.22917175292969, | |
| "epoch": 0.6468842729970327, | |
| "grad_norm": 65.69969177246094, | |
| "kl": 0.076904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8055002093315125, | |
| "reward_std": 0.04653235850855708, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8055001646280289, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 104.59375190734863, | |
| "epoch": 0.6528189910979229, | |
| "grad_norm": 6.102004051208496, | |
| "kl": 0.078857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7133222222328186, | |
| "reward_std": 0.11231098510324955, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7341555207967758, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 109.19791793823242, | |
| "epoch": 0.658753709198813, | |
| "grad_norm": 10.22568130493164, | |
| "kl": 0.07861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7694753110408783, | |
| "reward_std": 0.08740894356742501, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7798919528722763, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 110.48958587646484, | |
| "epoch": 0.6646884272997032, | |
| "grad_norm": 14.369743347167969, | |
| "kl": 0.077880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.678814172744751, | |
| "reward_std": 0.11626282706856728, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6996474862098694, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 108.34375381469727, | |
| "epoch": 0.6706231454005934, | |
| "grad_norm": 9.375542640686035, | |
| "kl": 0.0745849609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.76340052485466, | |
| "reward_std": 0.06236946932040155, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7738171219825745, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 106.72916984558105, | |
| "epoch": 0.6765578635014837, | |
| "grad_norm": 11.679986000061035, | |
| "kl": 0.0679931640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7401450872421265, | |
| "reward_std": 0.1390870539471507, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7713950872421265, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 112.61458587646484, | |
| "epoch": 0.6824925816023739, | |
| "grad_norm": 14.078929901123047, | |
| "kl": 0.0660400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8118026554584503, | |
| "reward_std": 0.06046540685929358, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8222192972898483, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 114.12500190734863, | |
| "epoch": 0.6884272997032641, | |
| "grad_norm": 30.76274299621582, | |
| "kl": 0.06036376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7482226490974426, | |
| "reward_std": 0.0746369045227766, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7586392909288406, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 114.25000381469727, | |
| "epoch": 0.6943620178041543, | |
| "grad_norm": 5.525589466094971, | |
| "kl": 0.06390380859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.668542355298996, | |
| "reward_std": 0.16833586525171995, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7102090418338776, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 115.56250381469727, | |
| "epoch": 0.7002967359050445, | |
| "grad_norm": 6.112927436828613, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7961181998252869, | |
| "reward_std": 0.10987653583288193, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8065348565578461, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 118.67708587646484, | |
| "epoch": 0.7062314540059347, | |
| "grad_norm": 5.205990314483643, | |
| "kl": 0.0548095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8050501644611359, | |
| "reward_std": 0.057498088805004954, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8154668360948563, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 126.40625381469727, | |
| "epoch": 0.712166172106825, | |
| "grad_norm": 17.208343505859375, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7176477015018463, | |
| "reward_std": 0.033727534115314484, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7176476120948792, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 125.34375381469727, | |
| "epoch": 0.7181008902077152, | |
| "grad_norm": 5.374625205993652, | |
| "kl": 0.05877685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7735000848770142, | |
| "reward_std": 0.05824981536716223, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7839167714118958, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 127.64583587646484, | |
| "epoch": 0.7240356083086054, | |
| "grad_norm": 7.2726311683654785, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8391265571117401, | |
| "reward_std": 0.04235434322617948, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8391265720129013, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 136.0416717529297, | |
| "epoch": 0.7299703264094956, | |
| "grad_norm": 3.9126789569854736, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.7534107267856598, | |
| "reward_std": 0.0715349493548274, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7638274282217026, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 132.17708778381348, | |
| "epoch": 0.7359050445103857, | |
| "grad_norm": 7.291192531585693, | |
| "kl": 0.04913330078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.6664856970310211, | |
| "reward_std": 0.0711211496964097, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6769023388624191, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 135.2708396911621, | |
| "epoch": 0.7418397626112759, | |
| "grad_norm": 5.235753059387207, | |
| "kl": 0.0531005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.666751205921173, | |
| "reward_std": 0.22133541852235794, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.7188344746828079, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 137.87500762939453, | |
| "epoch": 0.7477744807121661, | |
| "grad_norm": 7.52689266204834, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.717499554157257, | |
| "reward_std": 0.08941709902137518, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7279161959886551, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 142.9583396911621, | |
| "epoch": 0.7537091988130564, | |
| "grad_norm": 6.447893142700195, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.729353815317154, | |
| "reward_std": 0.0784148364327848, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7397704720497131, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 138.9479217529297, | |
| "epoch": 0.7596439169139466, | |
| "grad_norm": 9.053107261657715, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7839249968528748, | |
| "reward_std": 0.018866646569222212, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7839248925447464, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 139.73958587646484, | |
| "epoch": 0.7655786350148368, | |
| "grad_norm": 6.239522933959961, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8248617351055145, | |
| "reward_std": 0.047789576230570674, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8248616605997086, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 140.15625381469727, | |
| "epoch": 0.771513353115727, | |
| "grad_norm": 6.727787971496582, | |
| "kl": 0.05694580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7491380870342255, | |
| "reward_std": 0.0848769242875278, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7595547139644623, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 142.87500381469727, | |
| "epoch": 0.7774480712166172, | |
| "grad_norm": 10.61686897277832, | |
| "kl": 0.0516357421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6897242963314056, | |
| "reward_std": 0.11665517743676901, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7105576545000076, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 139.32291793823242, | |
| "epoch": 0.7833827893175074, | |
| "grad_norm": 26.17420196533203, | |
| "kl": 0.05377197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7028740346431732, | |
| "reward_std": 0.12365616485476494, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.723707303404808, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 141.1354217529297, | |
| "epoch": 0.7893175074183977, | |
| "grad_norm": 7.808278560638428, | |
| "kl": 0.05169677734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7467839121818542, | |
| "reward_std": 0.06525697093456984, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7572005093097687, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 139.0729217529297, | |
| "epoch": 0.7952522255192879, | |
| "grad_norm": 15.984428405761719, | |
| "kl": 0.06195068359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7193138897418976, | |
| "reward_std": 0.08782277535647154, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7297305464744568, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 144.7604217529297, | |
| "epoch": 0.8011869436201781, | |
| "grad_norm": 11.879109382629395, | |
| "kl": 0.052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7329791486263275, | |
| "reward_std": 0.07974315900355577, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7433958202600479, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 138.1770896911621, | |
| "epoch": 0.8071216617210683, | |
| "grad_norm": 7.3762383460998535, | |
| "kl": 0.0635986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.6841251850128174, | |
| "reward_std": 0.14460914488881826, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7153750509023666, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 136.5104217529297, | |
| "epoch": 0.8130563798219584, | |
| "grad_norm": 21.975542068481445, | |
| "kl": 0.0584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7298172414302826, | |
| "reward_std": 0.11540778167545795, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7506504952907562, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 138.06250762939453, | |
| "epoch": 0.8189910979228486, | |
| "grad_norm": 9.58665657043457, | |
| "kl": 0.05853271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7674833238124847, | |
| "reward_std": 0.0737875527702272, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7779000103473663, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 136.50000381469727, | |
| "epoch": 0.8249258160237388, | |
| "grad_norm": 6.232303142547607, | |
| "kl": 0.068603515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7411607801914215, | |
| "reward_std": 0.11014922056347132, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7619940489530563, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 135.50000762939453, | |
| "epoch": 0.8308605341246291, | |
| "grad_norm": 5.448104381561279, | |
| "kl": 0.0635986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7017558217048645, | |
| "reward_std": 0.14623194839805365, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7330057322978973, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 130.5104217529297, | |
| "epoch": 0.8367952522255193, | |
| "grad_norm": 6.76865816116333, | |
| "kl": 0.05865478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8449821770191193, | |
| "reward_std": 0.026794791920110583, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8449821621179581, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 132.0208396911621, | |
| "epoch": 0.8427299703264095, | |
| "grad_norm": 28.80607032775879, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7313937842845917, | |
| "reward_std": 0.17943856306374073, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7730603665113449, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 130.68750381469727, | |
| "epoch": 0.8486646884272997, | |
| "grad_norm": 6.0609564781188965, | |
| "kl": 0.0596923828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7204857766628265, | |
| "reward_std": 0.13561286870390177, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7413192093372345, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 132.06250190734863, | |
| "epoch": 0.8545994065281899, | |
| "grad_norm": 12.343249320983887, | |
| "kl": 0.06207275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.6887616515159607, | |
| "reward_std": 0.09354105032980442, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6991782933473587, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 128.94791984558105, | |
| "epoch": 0.8605341246290801, | |
| "grad_norm": 14.069941520690918, | |
| "kl": 0.064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7474890351295471, | |
| "reward_std": 0.13476973632350564, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7787390351295471, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 127.62500381469727, | |
| "epoch": 0.8664688427299704, | |
| "grad_norm": 10.8029146194458, | |
| "kl": 0.0654296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7317570745944977, | |
| "reward_std": 0.09545435523614287, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7525903880596161, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 130.9166717529297, | |
| "epoch": 0.8724035608308606, | |
| "grad_norm": 11.501479148864746, | |
| "kl": 0.0904541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.686740756034851, | |
| "reward_std": 0.1634499505162239, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7179906815290451, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 126.8750057220459, | |
| "epoch": 0.8783382789317508, | |
| "grad_norm": 13.238967895507812, | |
| "kl": 0.06903076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7835940718650818, | |
| "reward_std": 0.07068347651511431, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7940107583999634, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 128.73958778381348, | |
| "epoch": 0.884272997032641, | |
| "grad_norm": 12.029227256774902, | |
| "kl": 0.0657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7471203207969666, | |
| "reward_std": 0.051280025159940124, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7471202611923218, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 126.30208969116211, | |
| "epoch": 0.8902077151335311, | |
| "grad_norm": 10.597739219665527, | |
| "kl": 0.05908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7907660007476807, | |
| "reward_std": 0.095851581543684, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8011826276779175, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 130.15625762939453, | |
| "epoch": 0.8961424332344213, | |
| "grad_norm": 11.776272773742676, | |
| "kl": 0.06475830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8018748760223389, | |
| "reward_std": 0.06940475525334477, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8122915327548981, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 130.97916793823242, | |
| "epoch": 0.9020771513353115, | |
| "grad_norm": 13.906003952026367, | |
| "kl": 0.0582275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8337860107421875, | |
| "reward_std": 0.026843111030757427, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8337860107421875, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 130.58333778381348, | |
| "epoch": 0.9080118694362018, | |
| "grad_norm": 14.249563217163086, | |
| "kl": 0.05963134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.749543309211731, | |
| "reward_std": 0.02320151124149561, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7495433241128922, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 123.80208396911621, | |
| "epoch": 0.913946587537092, | |
| "grad_norm": 31.058822631835938, | |
| "kl": 0.056396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7307706773281097, | |
| "reward_std": 0.09688921645283699, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7411873042583466, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 127.3437557220459, | |
| "epoch": 0.9198813056379822, | |
| "grad_norm": 24.157621383666992, | |
| "kl": 0.0574951171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7422930300235748, | |
| "reward_std": 0.05691177165135741, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.742293044924736, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 125.56250381469727, | |
| "epoch": 0.9258160237388724, | |
| "grad_norm": 8.896293640136719, | |
| "kl": 0.06060791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7800202369689941, | |
| "reward_std": 0.10962340701371431, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8008535355329514, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 129.05208587646484, | |
| "epoch": 0.9317507418397626, | |
| "grad_norm": 9.590377807617188, | |
| "kl": 0.060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8012830018997192, | |
| "reward_std": 0.017546723363921046, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8012829422950745, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 126.30208396911621, | |
| "epoch": 0.9376854599406528, | |
| "grad_norm": 8.54839038848877, | |
| "kl": 0.05804443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.6956645548343658, | |
| "reward_std": 0.04086484480649233, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6956644654273987, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 128.5104217529297, | |
| "epoch": 0.9436201780415431, | |
| "grad_norm": 6.301590442657471, | |
| "kl": 0.05426025390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7389605939388275, | |
| "reward_std": 0.11934427171945572, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7493772804737091, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 126.62500381469727, | |
| "epoch": 0.9495548961424333, | |
| "grad_norm": 6.094778537750244, | |
| "kl": 0.05352783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7299975156784058, | |
| "reward_std": 0.05153268342837691, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7299974858760834, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 126.44792175292969, | |
| "epoch": 0.9554896142433235, | |
| "grad_norm": 16.101211547851562, | |
| "kl": 0.05364990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7796873152256012, | |
| "reward_std": 0.10896322131156921, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8005205690860748, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 127.29167366027832, | |
| "epoch": 0.9614243323442137, | |
| "grad_norm": 9.07394790649414, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.806904137134552, | |
| "reward_std": 0.03972620144486427, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8069040328264236, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 124.92708778381348, | |
| "epoch": 0.9673590504451038, | |
| "grad_norm": 4.550307750701904, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7452739477157593, | |
| "reward_std": 0.10492927418090403, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7765238583087921, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 125.27083587646484, | |
| "epoch": 0.973293768545994, | |
| "grad_norm": 7.808937072753906, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7517999708652496, | |
| "reward_std": 0.09145255433395505, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7622165530920029, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 125.53125381469727, | |
| "epoch": 0.9792284866468842, | |
| "grad_norm": 4.795185565948486, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7628948986530304, | |
| "reward_std": 0.05225597298704088, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.773311510682106, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 126.81250381469727, | |
| "epoch": 0.9851632047477745, | |
| "grad_norm": 212.8815460205078, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7152629494667053, | |
| "reward_std": 0.0905265836045146, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7256796211004257, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 122.66666793823242, | |
| "epoch": 0.9910979228486647, | |
| "grad_norm": 13.994025230407715, | |
| "kl": 0.05609130859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.82010880112648, | |
| "reward_std": 0.04186421073973179, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8201088160276413, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 116.84375190734863, | |
| "epoch": 0.9970326409495549, | |
| "grad_norm": 6.674865245819092, | |
| "kl": 0.05755615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8161313235759735, | |
| "reward_std": 0.020860509714111686, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8161313384771347, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 122.33333587646484, | |
| "epoch": 1.0, | |
| "grad_norm": 6.674865245819092, | |
| "kl": 0.05810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8273937106132507, | |
| "reward_std": 0.05868770182132721, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8482270240783691, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 120.18750190734863, | |
| "epoch": 1.0059347181008902, | |
| "grad_norm": 4.1788835525512695, | |
| "kl": 0.0687255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.6861424446105957, | |
| "reward_std": 0.07579059433192015, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6965591013431549, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 121.20833587646484, | |
| "epoch": 1.0118694362017804, | |
| "grad_norm": 8.323039054870605, | |
| "kl": 0.06103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8149082660675049, | |
| "reward_std": 0.06312395888380706, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8253249377012253, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 120.20833587646484, | |
| "epoch": 1.0178041543026706, | |
| "grad_norm": 4.517621040344238, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.792221575975418, | |
| "reward_std": 0.021815289743244648, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7922215461730957, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 119.87500190734863, | |
| "epoch": 1.0237388724035608, | |
| "grad_norm": 9.510743141174316, | |
| "kl": 0.05963134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8078531324863434, | |
| "reward_std": 0.07382986601442099, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8182697743177414, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 119.87500190734863, | |
| "epoch": 1.029673590504451, | |
| "grad_norm": 5.854668140411377, | |
| "kl": 0.07275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8211482167243958, | |
| "reward_std": 0.08701172703877091, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8315648138523102, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 115.69791984558105, | |
| "epoch": 1.0356083086053411, | |
| "grad_norm": 3.5263679027557373, | |
| "kl": 0.068603515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7708676159381866, | |
| "reward_std": 0.07291271979920566, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7812842577695847, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 119.44791984558105, | |
| "epoch": 1.0415430267062316, | |
| "grad_norm": 9.571796417236328, | |
| "kl": 0.06640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7062998712062836, | |
| "reward_std": 0.03168163984082639, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7062998414039612, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 115.05208587646484, | |
| "epoch": 1.0474777448071217, | |
| "grad_norm": 3.939903974533081, | |
| "kl": 0.07196044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.6977596580982208, | |
| "reward_std": 0.0435628320556134, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6977596431970596, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 120.00000381469727, | |
| "epoch": 1.053412462908012, | |
| "grad_norm": 5.052703380584717, | |
| "kl": 0.072265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7161425948143005, | |
| "reward_std": 0.08243446378037333, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.726559191942215, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 114.14583778381348, | |
| "epoch": 1.0593471810089021, | |
| "grad_norm": 3.9155845642089844, | |
| "kl": 0.07421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.7838780879974365, | |
| "reward_std": 0.05022241431288421, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.794294685125351, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 113.77083396911621, | |
| "epoch": 1.0652818991097923, | |
| "grad_norm": 6.6712727546691895, | |
| "kl": 0.0728759765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7887768745422363, | |
| "reward_std": 0.03889981145039201, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7887768745422363, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 115.16666793823242, | |
| "epoch": 1.0712166172106825, | |
| "grad_norm": 8.987320899963379, | |
| "kl": 0.068359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7815601825714111, | |
| "reward_std": 0.045000725833233446, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7815601229667664, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 115.10416793823242, | |
| "epoch": 1.0771513353115727, | |
| "grad_norm": 5.333348274230957, | |
| "kl": 0.08056640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.807422399520874, | |
| "reward_std": 0.06547991407569498, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8178391009569168, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 113.58333587646484, | |
| "epoch": 1.083086053412463, | |
| "grad_norm": 14.877934455871582, | |
| "kl": 0.0751953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.7589649856090546, | |
| "reward_std": 0.036034910939633846, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7589650005102158, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 115.14583587646484, | |
| "epoch": 1.089020771513353, | |
| "grad_norm": 5.486746311187744, | |
| "kl": 0.078369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.692201405763626, | |
| "reward_std": 0.06457182578742504, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6922013610601425, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 116.83333396911621, | |
| "epoch": 1.0949554896142433, | |
| "grad_norm": 22.6247501373291, | |
| "kl": 0.069580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7884438037872314, | |
| "reward_std": 0.0695024150190875, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7988604456186295, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 114.22916984558105, | |
| "epoch": 1.1008902077151335, | |
| "grad_norm": 3.4625394344329834, | |
| "kl": 0.06756591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7949866950511932, | |
| "reward_std": 0.0218111855792813, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7949866652488708, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 114.61458587646484, | |
| "epoch": 1.1068249258160237, | |
| "grad_norm": 3.797193765640259, | |
| "kl": 0.07470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.7465449571609497, | |
| "reward_std": 0.06478537991642952, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7569615691900253, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 113.55208587646484, | |
| "epoch": 1.1127596439169138, | |
| "grad_norm": 3.3276095390319824, | |
| "kl": 0.091552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7474755942821503, | |
| "reward_std": 0.07169347535818815, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.757892295718193, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 115.68750381469727, | |
| "epoch": 1.1186943620178043, | |
| "grad_norm": 5.974353790283203, | |
| "kl": 0.079345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.8201344907283783, | |
| "reward_std": 0.04725779825821519, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8305511772632599, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 117.37500190734863, | |
| "epoch": 1.1246290801186944, | |
| "grad_norm": 16.502485275268555, | |
| "kl": 0.077880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8688680529594421, | |
| "reward_std": 0.01468480727635324, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8688680231571198, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 114.40625190734863, | |
| "epoch": 1.1305637982195846, | |
| "grad_norm": 5.832488059997559, | |
| "kl": 0.082275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7779072523117065, | |
| "reward_std": 0.10380933433771133, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7987405955791473, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 113.42708587646484, | |
| "epoch": 1.1364985163204748, | |
| "grad_norm": 11.94194507598877, | |
| "kl": 0.0953369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7498697340488434, | |
| "reward_std": 0.06882704934105277, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.760286420583725, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 114.39583778381348, | |
| "epoch": 1.142433234421365, | |
| "grad_norm": 3.888754367828369, | |
| "kl": 0.0953369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7453253865242004, | |
| "reward_std": 0.10540217161178589, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/segmentation_reward": 0.776575356721878, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 119.65625381469727, | |
| "epoch": 1.1483679525222552, | |
| "grad_norm": 8.40555477142334, | |
| "kl": 0.0889892578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7195344865322113, | |
| "reward_std": 0.0910744748543948, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7299510538578033, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 117.95833396911621, | |
| "epoch": 1.1543026706231454, | |
| "grad_norm": 14.257683753967285, | |
| "kl": 0.0919189453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7635149657726288, | |
| "reward_std": 0.049157430883497, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7635148763656616, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 118.45833587646484, | |
| "epoch": 1.1602373887240356, | |
| "grad_norm": 4.263291835784912, | |
| "kl": 0.08837890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.6939684748649597, | |
| "reward_std": 0.08351217093877494, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7148017585277557, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 114.95833778381348, | |
| "epoch": 1.1661721068249258, | |
| "grad_norm": 4.633386611938477, | |
| "kl": 0.0999755859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.8257586061954498, | |
| "reward_std": 0.03723354451358318, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.825758546590805, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 115.04166793823242, | |
| "epoch": 1.172106824925816, | |
| "grad_norm": 5.097075939178467, | |
| "kl": 0.09375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7617026567459106, | |
| "reward_std": 0.12132613873109221, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7825359106063843, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 116.18750381469727, | |
| "epoch": 1.1780415430267062, | |
| "grad_norm": 3.8244075775146484, | |
| "kl": 0.096435546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7776153683662415, | |
| "reward_std": 0.11768396757543087, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7984486818313599, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 118.48958587646484, | |
| "epoch": 1.1839762611275964, | |
| "grad_norm": 3.69380521774292, | |
| "kl": 0.0916748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7464786171913147, | |
| "reward_std": 0.12091443943791091, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7777286469936371, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 120.65625190734863, | |
| "epoch": 1.1899109792284865, | |
| "grad_norm": 17.07198715209961, | |
| "kl": 0.11572265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0046, | |
| "reward": 1.7191323935985565, | |
| "reward_std": 0.07988837361335754, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7295490056276321, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 124.06250190734863, | |
| "epoch": 1.1958456973293767, | |
| "grad_norm": 3.0627005100250244, | |
| "kl": 0.0885009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.70257368683815, | |
| "reward_std": 0.15230464632622898, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.73382368683815, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 118.03125, | |
| "epoch": 1.2017804154302671, | |
| "grad_norm": 4.091593265533447, | |
| "kl": 0.09326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7475507259368896, | |
| "reward_std": 0.09758387203328311, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7683840394020081, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 120.31250190734863, | |
| "epoch": 1.2077151335311573, | |
| "grad_norm": 2.6315650939941406, | |
| "kl": 0.0948486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.778723031282425, | |
| "reward_std": 0.06321048270910978, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7891396880149841, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 117.28125, | |
| "epoch": 1.2136498516320475, | |
| "grad_norm": 5.009098052978516, | |
| "kl": 0.093505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7251938581466675, | |
| "reward_std": 0.1409313241019845, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7564438581466675, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 115.86458587646484, | |
| "epoch": 1.2195845697329377, | |
| "grad_norm": 8.25062084197998, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.674118846654892, | |
| "reward_std": 0.17346674762666225, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7157854735851288, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 117.28125190734863, | |
| "epoch": 1.225519287833828, | |
| "grad_norm": 4.700351715087891, | |
| "kl": 0.09814453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.642125815153122, | |
| "reward_std": 0.23734556511044502, | |
| "rewards/format_reward": 0.9375000298023224, | |
| "rewards/segmentation_reward": 0.7046257257461548, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 117.13541984558105, | |
| "epoch": 1.231454005934718, | |
| "grad_norm": 10.711128234863281, | |
| "kl": 0.091552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7565841972827911, | |
| "reward_std": 0.11169130681082606, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7774175554513931, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 116.43750190734863, | |
| "epoch": 1.2373887240356083, | |
| "grad_norm": 5.590770244598389, | |
| "kl": 0.0892333984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7059676349163055, | |
| "reward_std": 0.1644108621403575, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7372175455093384, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 117.04166984558105, | |
| "epoch": 1.2433234421364985, | |
| "grad_norm": 5.0753092765808105, | |
| "kl": 0.0899658203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7945037186145782, | |
| "reward_std": 0.09403454745188355, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8153369724750519, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 115.83333396911621, | |
| "epoch": 1.2492581602373887, | |
| "grad_norm": 6.409451007843018, | |
| "kl": 0.09228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.731789082288742, | |
| "reward_std": 0.15776935871690512, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7630390673875809, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 114.11458587646484, | |
| "epoch": 1.2551928783382789, | |
| "grad_norm": 4.285139083862305, | |
| "kl": 0.0980224609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7591657638549805, | |
| "reward_std": 0.14351706253364682, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7904157638549805, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 112.38541984558105, | |
| "epoch": 1.2611275964391693, | |
| "grad_norm": 10.488381385803223, | |
| "kl": 0.1031494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.7158048748970032, | |
| "reward_std": 0.07641064748167992, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.726221576333046, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 115.22916793823242, | |
| "epoch": 1.2670623145400595, | |
| "grad_norm": 4.701923370361328, | |
| "kl": 0.0870361328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.783218115568161, | |
| "reward_std": 0.04100660281255841, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7832181453704834, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 119.78125190734863, | |
| "epoch": 1.2729970326409497, | |
| "grad_norm": 4.9908766746521, | |
| "kl": 0.0970458984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.752919316291809, | |
| "reward_std": 0.15398756321519613, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7737526744604111, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 116.95833587646484, | |
| "epoch": 1.2789317507418398, | |
| "grad_norm": 13.415197372436523, | |
| "kl": 0.090087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.6532210111618042, | |
| "reward_std": 0.2240110612474382, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.705304279923439, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 119.66666984558105, | |
| "epoch": 1.28486646884273, | |
| "grad_norm": 2.535939931869507, | |
| "kl": 0.09765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7339254915714264, | |
| "reward_std": 0.13148222491145134, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.765175461769104, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 118.87500190734863, | |
| "epoch": 1.2908011869436202, | |
| "grad_norm": 4.534719944000244, | |
| "kl": 0.1004638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.8009094595909119, | |
| "reward_std": 0.07824655435979366, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8113261461257935, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 120.03125381469727, | |
| "epoch": 1.2967359050445104, | |
| "grad_norm": 9.774252891540527, | |
| "kl": 0.100830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.761326789855957, | |
| "reward_std": 0.04916087444871664, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7613267600536346, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 114.40625, | |
| "epoch": 1.3026706231454006, | |
| "grad_norm": 3.3857150077819824, | |
| "kl": 0.094970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7107034027576447, | |
| "reward_std": 0.12496633175760508, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7315367162227631, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 118.66666793823242, | |
| "epoch": 1.3086053412462908, | |
| "grad_norm": 3.008610725402832, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.751450091600418, | |
| "reward_std": 0.06824500812217593, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7618667632341385, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 119.70833587646484, | |
| "epoch": 1.314540059347181, | |
| "grad_norm": 3.0898277759552, | |
| "kl": 0.08740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7804038226604462, | |
| "reward_std": 0.05429700808599591, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7908205091953278, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 114.90625381469727, | |
| "epoch": 1.3204747774480712, | |
| "grad_norm": 3.706261396408081, | |
| "kl": 0.0892333984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7740592658519745, | |
| "reward_std": 0.04056970216333866, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7740592062473297, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 117.84375190734863, | |
| "epoch": 1.3264094955489614, | |
| "grad_norm": 6.03821325302124, | |
| "kl": 0.1055908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.7339475452899933, | |
| "reward_std": 0.053982728626579046, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7443641424179077, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 120.10416984558105, | |
| "epoch": 1.3323442136498516, | |
| "grad_norm": 7.86570930480957, | |
| "kl": 0.081787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7618150115013123, | |
| "reward_std": 0.11194364842958748, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7826483100652695, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 119.80208778381348, | |
| "epoch": 1.3382789317507418, | |
| "grad_norm": 5.573207378387451, | |
| "kl": 0.089111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7204334437847137, | |
| "reward_std": 0.14075168408453465, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7516833990812302, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 118.75000190734863, | |
| "epoch": 1.344213649851632, | |
| "grad_norm": 2.9999611377716064, | |
| "kl": 0.093017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7968370914459229, | |
| "reward_std": 0.05921215028502047, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8072537481784821, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 127.47916984558105, | |
| "epoch": 1.3501483679525221, | |
| "grad_norm": 3.0452966690063477, | |
| "kl": 0.0789794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7333484888076782, | |
| "reward_std": 0.13363408669829369, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7645984143018723, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 118.04166984558105, | |
| "epoch": 1.3560830860534125, | |
| "grad_norm": 19.453765869140625, | |
| "kl": 0.0914306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7236287295818329, | |
| "reward_std": 0.13475321233272552, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7548786997795105, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 125.07292175292969, | |
| "epoch": 1.3620178041543027, | |
| "grad_norm": 3.446118116378784, | |
| "kl": 0.0867919921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7927662432193756, | |
| "reward_std": 0.050292326137423515, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7927661687135696, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 122.11458778381348, | |
| "epoch": 1.367952522255193, | |
| "grad_norm": 3.5459084510803223, | |
| "kl": 0.0965576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7196455001831055, | |
| "reward_std": 0.10603441158309579, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7300621271133423, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 124.71875381469727, | |
| "epoch": 1.3738872403560831, | |
| "grad_norm": 2.9705264568328857, | |
| "kl": 0.0799560546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.831710696220398, | |
| "reward_std": 0.03920856770128012, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.831710696220398, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 121.53125190734863, | |
| "epoch": 1.3798219584569733, | |
| "grad_norm": 3.8888819217681885, | |
| "kl": 0.07861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.816581815481186, | |
| "reward_std": 0.03990558721125126, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8165817707777023, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 123.67708587646484, | |
| "epoch": 1.3857566765578635, | |
| "grad_norm": 4.347487449645996, | |
| "kl": 0.0831298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7707000076770782, | |
| "reward_std": 0.10984114836901426, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7915334403514862, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 122.53125190734863, | |
| "epoch": 1.3916913946587537, | |
| "grad_norm": 3.111170768737793, | |
| "kl": 0.08544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.776373028755188, | |
| "reward_std": 0.10835065133869648, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7972063720226288, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 127.32291793823242, | |
| "epoch": 1.3976261127596439, | |
| "grad_norm": 15.74927806854248, | |
| "kl": 0.078857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.8090568482875824, | |
| "reward_std": 0.02911346103064716, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8090568333864212, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 128.96875381469727, | |
| "epoch": 1.403560830860534, | |
| "grad_norm": 3.8220973014831543, | |
| "kl": 0.084228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.6805144250392914, | |
| "reward_std": 0.05462493887171149, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.680514469742775, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 123.5625057220459, | |
| "epoch": 1.4094955489614243, | |
| "grad_norm": 7.271127700805664, | |
| "kl": 0.081298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.8020037412643433, | |
| "reward_std": 0.07482604053802788, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8124203979969025, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 127.60417175292969, | |
| "epoch": 1.4154302670623147, | |
| "grad_norm": 3.528641700744629, | |
| "kl": 0.0869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7796452343463898, | |
| "reward_std": 0.09994546975940466, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8004785478115082, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 132.9895896911621, | |
| "epoch": 1.4213649851632049, | |
| "grad_norm": 4.905125617980957, | |
| "kl": 0.0906982421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7627727091312408, | |
| "reward_std": 0.0973970009945333, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7836060523986816, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 131.90625381469727, | |
| "epoch": 1.427299703264095, | |
| "grad_norm": 4.065957546234131, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7551555633544922, | |
| "reward_std": 0.0871009798720479, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7655721604824066, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 127.75000381469727, | |
| "epoch": 1.4332344213649852, | |
| "grad_norm": 9.198005676269531, | |
| "kl": 0.0819091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7510286271572113, | |
| "reward_std": 0.0416345689445734, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7510285973548889, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 134.58333778381348, | |
| "epoch": 1.4391691394658754, | |
| "grad_norm": 2.298591375350952, | |
| "kl": 0.0924072265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7603932321071625, | |
| "reward_std": 0.08814567420631647, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7708099633455276, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 133.47916984558105, | |
| "epoch": 1.4451038575667656, | |
| "grad_norm": 3.710606336593628, | |
| "kl": 0.08740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7705409824848175, | |
| "reward_std": 0.10231837723404169, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7809576243162155, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 132.7395896911621, | |
| "epoch": 1.4510385756676558, | |
| "grad_norm": 9.152044296264648, | |
| "kl": 0.0809326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7908837795257568, | |
| "reward_std": 0.06551890983246267, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8013004064559937, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 131.01041793823242, | |
| "epoch": 1.456973293768546, | |
| "grad_norm": 8.228063583374023, | |
| "kl": 0.0845947265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7855284810066223, | |
| "reward_std": 0.025126937543973327, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7855284363031387, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 137.91666793823242, | |
| "epoch": 1.4629080118694362, | |
| "grad_norm": 6.479981422424316, | |
| "kl": 0.15380859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0061, | |
| "reward": 1.7316823601722717, | |
| "reward_std": 0.09728977642953396, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7420989274978638, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 129.26042366027832, | |
| "epoch": 1.4688427299703264, | |
| "grad_norm": 24.555652618408203, | |
| "kl": 0.0960693359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7869995534420013, | |
| "reward_std": 0.03961431025527418, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7869995981454849, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 129.4583396911621, | |
| "epoch": 1.4747774480712166, | |
| "grad_norm": 3.8002452850341797, | |
| "kl": 0.095458984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.8435184359550476, | |
| "reward_std": 0.028940949589014053, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8435183763504028, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 131.2291717529297, | |
| "epoch": 1.4807121661721068, | |
| "grad_norm": 5.852691650390625, | |
| "kl": 0.17578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.007, | |
| "reward": 1.7853436172008514, | |
| "reward_std": 0.08121407218277454, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7957603484392166, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 133.0729217529297, | |
| "epoch": 1.486646884272997, | |
| "grad_norm": 4.016416072845459, | |
| "kl": 0.0858154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.6717259883880615, | |
| "reward_std": 0.15333154564723372, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7029759734869003, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 130.0416717529297, | |
| "epoch": 1.4925816023738872, | |
| "grad_norm": 2.7940571308135986, | |
| "kl": 0.0894775390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7898103296756744, | |
| "reward_std": 0.11414735415019095, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8106437027454376, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 135.3333396911621, | |
| "epoch": 1.4985163204747773, | |
| "grad_norm": 6.82745885848999, | |
| "kl": 0.088134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8346801698207855, | |
| "reward_std": 0.02834852272644639, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8346801251173019, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 127.35417175292969, | |
| "epoch": 1.5044510385756675, | |
| "grad_norm": 2.9223759174346924, | |
| "kl": 0.0977783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.8006429970264435, | |
| "reward_std": 0.11039311997592449, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8214763551950455, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 128.17708587646484, | |
| "epoch": 1.5103857566765577, | |
| "grad_norm": 3.724637269973755, | |
| "kl": 0.1004638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7425091862678528, | |
| "reward_std": 0.08927370049059391, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.752925843000412, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 128.7708396911621, | |
| "epoch": 1.516320474777448, | |
| "grad_norm": 4.042162895202637, | |
| "kl": 0.092529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7844707369804382, | |
| "reward_std": 0.057451182045042515, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7844707667827606, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 125.92708587646484, | |
| "epoch": 1.5222551928783383, | |
| "grad_norm": 8.26577377319336, | |
| "kl": 0.0897216796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.8131219148635864, | |
| "reward_std": 0.021738199284300208, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8131218403577805, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 125.91667175292969, | |
| "epoch": 1.5281899109792285, | |
| "grad_norm": 3.753349781036377, | |
| "kl": 0.0914306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7764064967632294, | |
| "reward_std": 0.06681416090577841, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7868231534957886, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 125.88542175292969, | |
| "epoch": 1.5341246290801187, | |
| "grad_norm": 3.156595230102539, | |
| "kl": 0.10546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.7478066980838776, | |
| "reward_std": 0.1068794084712863, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7686399817466736, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 121.90625190734863, | |
| "epoch": 1.540059347181009, | |
| "grad_norm": 3.946298360824585, | |
| "kl": 0.097412109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7763938307762146, | |
| "reward_std": 0.12856396986171603, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8076437562704086, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 123.72917175292969, | |
| "epoch": 1.545994065281899, | |
| "grad_norm": 5.4555277824401855, | |
| "kl": 0.097900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7746759355068207, | |
| "reward_std": 0.05479801073670387, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7746759802103043, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 120.48958396911621, | |
| "epoch": 1.5519287833827893, | |
| "grad_norm": 4.6337199211120605, | |
| "kl": 0.1065673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.7088752686977386, | |
| "reward_std": 0.08904288220219314, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7192919701337814, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 117.96875381469727, | |
| "epoch": 1.5578635014836797, | |
| "grad_norm": 97.80472564697266, | |
| "kl": 0.098876953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.8269087076187134, | |
| "reward_std": 0.04217356303706765, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8269086629152298, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 115.72916984558105, | |
| "epoch": 1.5637982195845699, | |
| "grad_norm": 4.1004486083984375, | |
| "kl": 0.102294921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.8442316353321075, | |
| "reward_std": 0.04190053790807724, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8442316502332687, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 117.45833587646484, | |
| "epoch": 1.56973293768546, | |
| "grad_norm": 26.500062942504883, | |
| "kl": 0.099365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7497759461402893, | |
| "reward_std": 0.03234653011895716, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7497759461402893, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 117.09375381469727, | |
| "epoch": 1.5756676557863503, | |
| "grad_norm": 5.039199352264404, | |
| "kl": 0.1082763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.8009319305419922, | |
| "reward_std": 0.032779114320874214, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8009319305419922, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 117.69792175292969, | |
| "epoch": 1.5816023738872405, | |
| "grad_norm": 4.768641471862793, | |
| "kl": 0.1068115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.7576944530010223, | |
| "reward_std": 0.07862287666648626, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7681111395359039, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 119.13541984558105, | |
| "epoch": 1.5875370919881306, | |
| "grad_norm": 6.0930352210998535, | |
| "kl": 0.09130859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7958367764949799, | |
| "reward_std": 0.09232278482522815, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8166700899600983, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 115.75000381469727, | |
| "epoch": 1.5934718100890208, | |
| "grad_norm": 26.93886947631836, | |
| "kl": 0.0869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.788069725036621, | |
| "reward_std": 0.02124928869307041, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7880697846412659, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 114.47916984558105, | |
| "epoch": 1.599406528189911, | |
| "grad_norm": 7.8712663650512695, | |
| "kl": 0.0975341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7922349870204926, | |
| "reward_std": 0.06056637444999069, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8026516437530518, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 117.11458587646484, | |
| "epoch": 1.6053412462908012, | |
| "grad_norm": 4.941649913787842, | |
| "kl": 0.1014404296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.8086935579776764, | |
| "reward_std": 0.015388808911666274, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8086935430765152, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 116.30208587646484, | |
| "epoch": 1.6112759643916914, | |
| "grad_norm": 4.916129112243652, | |
| "kl": 0.1123046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0045, | |
| "reward": 1.6760722398757935, | |
| "reward_std": 0.05616055289283395, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6864889115095139, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 109.93750190734863, | |
| "epoch": 1.6172106824925816, | |
| "grad_norm": 3.3433773517608643, | |
| "kl": 0.099853515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7660618126392365, | |
| "reward_std": 0.05840137042105198, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7660617679357529, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 116.47916984558105, | |
| "epoch": 1.6231454005934718, | |
| "grad_norm": 3.4884281158447266, | |
| "kl": 0.1029052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.77039036154747, | |
| "reward_std": 0.044363456312566996, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7703903168439865, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 112.77083396911621, | |
| "epoch": 1.629080118694362, | |
| "grad_norm": 4.025818347930908, | |
| "kl": 0.1072998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.724378764629364, | |
| "reward_std": 0.07113260589540005, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7347953617572784, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 114.58333587646484, | |
| "epoch": 1.6350148367952522, | |
| "grad_norm": 12.856447219848633, | |
| "kl": 0.113525390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0045, | |
| "reward": 1.7462435364723206, | |
| "reward_std": 0.07672798447310925, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7566602230072021, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 114.94791793823242, | |
| "epoch": 1.6409495548961424, | |
| "grad_norm": 9.79211711883545, | |
| "kl": 0.106689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.7808694243431091, | |
| "reward_std": 0.07154811033979058, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7912861257791519, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 114.31250381469727, | |
| "epoch": 1.6468842729970326, | |
| "grad_norm": 4.4307942390441895, | |
| "kl": 0.0987548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.8354142606258392, | |
| "reward_std": 0.03158940875437111, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8354142606258392, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 113.84375190734863, | |
| "epoch": 1.6528189910979227, | |
| "grad_norm": 6.534969806671143, | |
| "kl": 0.1024169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.8007658421993256, | |
| "reward_std": 0.03258772916160524, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.800765797495842, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 116.56250190734863, | |
| "epoch": 1.658753709198813, | |
| "grad_norm": 3.3599140644073486, | |
| "kl": 0.0963134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7653041183948517, | |
| "reward_std": 0.08127650991082191, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7757207006216049, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 116.84375381469727, | |
| "epoch": 1.6646884272997031, | |
| "grad_norm": 3.239370107650757, | |
| "kl": 0.0994873046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7427730858325958, | |
| "reward_std": 0.17299647070467472, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7740231454372406, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 116.05208587646484, | |
| "epoch": 1.6706231454005933, | |
| "grad_norm": 9.61308765411377, | |
| "kl": 0.095947265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7398262023925781, | |
| "reward_std": 0.03492267336696386, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7398261874914169, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 117.16666793823242, | |
| "epoch": 1.6765578635014837, | |
| "grad_norm": 2.8493831157684326, | |
| "kl": 0.114990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0046, | |
| "reward": 1.8178634643554688, | |
| "reward_std": 0.04271406587213278, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8178633898496628, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 120.60416984558105, | |
| "epoch": 1.682492581602374, | |
| "grad_norm": 14.684961318969727, | |
| "kl": 0.091064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7771551609039307, | |
| "reward_std": 0.04852711455896497, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7771551162004471, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 124.32291793823242, | |
| "epoch": 1.688427299703264, | |
| "grad_norm": 15.723881721496582, | |
| "kl": 0.099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7905828952789307, | |
| "reward_std": 0.039514560252428055, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7905828356742859, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 125.19791984558105, | |
| "epoch": 1.6943620178041543, | |
| "grad_norm": 9.84542179107666, | |
| "kl": 0.0933837890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7513935565948486, | |
| "reward_std": 0.07400949532166123, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7618101686239243, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 126.54166793823242, | |
| "epoch": 1.7002967359050445, | |
| "grad_norm": 7.75327205657959, | |
| "kl": 0.1571044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0063, | |
| "reward": 1.7322804033756256, | |
| "reward_std": 0.09522599866613746, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7531136721372604, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 130.23958778381348, | |
| "epoch": 1.7062314540059347, | |
| "grad_norm": 2.8010153770446777, | |
| "kl": 0.087646484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7387493252754211, | |
| "reward_std": 0.07657346210908145, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7491659671068192, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 124.05208587646484, | |
| "epoch": 1.712166172106825, | |
| "grad_norm": 7.392086029052734, | |
| "kl": 0.0909423828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.8153815567493439, | |
| "reward_std": 0.05484287068247795, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8153815120458603, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 123.72916984558105, | |
| "epoch": 1.7181008902077153, | |
| "grad_norm": 3.426253080368042, | |
| "kl": 0.0906982421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.8041094243526459, | |
| "reward_std": 0.03429581504315138, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8041094243526459, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 128.9895896911621, | |
| "epoch": 1.7240356083086055, | |
| "grad_norm": 5.66267204284668, | |
| "kl": 0.1024169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.7256536781787872, | |
| "reward_std": 0.1030060425400734, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7464869767427444, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 126.36458587646484, | |
| "epoch": 1.7299703264094957, | |
| "grad_norm": 4.350327491760254, | |
| "kl": 0.08984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7592671513557434, | |
| "reward_std": 0.10288255475461483, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7801005095243454, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 126.08333778381348, | |
| "epoch": 1.7359050445103859, | |
| "grad_norm": 5.17351770401001, | |
| "kl": 0.09716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7503591477870941, | |
| "reward_std": 0.08022835082374513, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7607757598161697, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 124.03125190734863, | |
| "epoch": 1.741839762611276, | |
| "grad_norm": 13.972125053405762, | |
| "kl": 0.095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.6726593971252441, | |
| "reward_std": 0.1583157368004322, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7039093673229218, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 134.20833587646484, | |
| "epoch": 1.7477744807121662, | |
| "grad_norm": 2.3745687007904053, | |
| "kl": 0.1024169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.7382087111473083, | |
| "reward_std": 0.13995032757520676, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7694587409496307, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 131.4270896911621, | |
| "epoch": 1.7537091988130564, | |
| "grad_norm": 3.0401179790496826, | |
| "kl": 0.0921630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7459054291248322, | |
| "reward_std": 0.1274840518599376, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7771554589271545, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 131.6979217529297, | |
| "epoch": 1.7596439169139466, | |
| "grad_norm": 3.122546911239624, | |
| "kl": 0.103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.8136819303035736, | |
| "reward_std": 0.07234706217423081, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8240985721349716, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 129.11458778381348, | |
| "epoch": 1.7655786350148368, | |
| "grad_norm": 6.673977375030518, | |
| "kl": 0.092529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7540749609470367, | |
| "reward_std": 0.11415270157158375, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7749083042144775, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 131.53125762939453, | |
| "epoch": 1.771513353115727, | |
| "grad_norm": 3.5474982261657715, | |
| "kl": 0.097900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.6313540041446686, | |
| "reward_std": 0.21476275008171797, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.6834373325109482, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 140.59375381469727, | |
| "epoch": 1.7774480712166172, | |
| "grad_norm": 3.5867486000061035, | |
| "kl": 0.0789794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.6557375490665436, | |
| "reward_std": 0.16151662543416023, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.6869875341653824, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 142.75000762939453, | |
| "epoch": 1.7833827893175074, | |
| "grad_norm": 3.54956316947937, | |
| "kl": 0.087646484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.777003139257431, | |
| "reward_std": 0.048837858252227306, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7770031094551086, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 144.80208587646484, | |
| "epoch": 1.7893175074183976, | |
| "grad_norm": 5.2782673835754395, | |
| "kl": 0.08740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8177353739738464, | |
| "reward_std": 0.052700204541906714, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8177353739738464, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 141.9583396911621, | |
| "epoch": 1.7952522255192878, | |
| "grad_norm": 2.5410995483398438, | |
| "kl": 0.0814208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.8333467245101929, | |
| "reward_std": 0.03343971585854888, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8333466798067093, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 146.65625762939453, | |
| "epoch": 1.801186943620178, | |
| "grad_norm": 3.657487630844116, | |
| "kl": 0.0743408203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.7758903205394745, | |
| "reward_std": 0.04749991255812347, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7758902609348297, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 145.7083396911621, | |
| "epoch": 1.8071216617210681, | |
| "grad_norm": 5.898721218109131, | |
| "kl": 0.0816650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7634993493556976, | |
| "reward_std": 0.13334597554057837, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7947493195533752, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 144.53125381469727, | |
| "epoch": 1.8130563798219583, | |
| "grad_norm": 3.5383071899414062, | |
| "kl": 0.078369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8184546828269958, | |
| "reward_std": 0.06767121748998761, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.828871339559555, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 148.30208587646484, | |
| "epoch": 1.8189910979228485, | |
| "grad_norm": 3.506437063217163, | |
| "kl": 0.078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7857343256473541, | |
| "reward_std": 0.11842209007591009, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8065676689147949, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 150.96875381469727, | |
| "epoch": 1.8249258160237387, | |
| "grad_norm": 3.9226300716400146, | |
| "kl": 0.0777587890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7295546233654022, | |
| "reward_std": 0.1010670899413526, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.739971250295639, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 146.6666717529297, | |
| "epoch": 1.8308605341246291, | |
| "grad_norm": 17.635072708129883, | |
| "kl": 0.083740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7234169244766235, | |
| "reward_std": 0.1391521729528904, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7546669095754623, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 152.71875762939453, | |
| "epoch": 1.8367952522255193, | |
| "grad_norm": 8.344084739685059, | |
| "kl": 0.08251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7250747382640839, | |
| "reward_std": 0.05801352020353079, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7250746786594391, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 148.9479217529297, | |
| "epoch": 1.8427299703264095, | |
| "grad_norm": 5.004342555999756, | |
| "kl": 0.08349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7435450851917267, | |
| "reward_std": 0.12095003947615623, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7643783837556839, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 139.56250381469727, | |
| "epoch": 1.8486646884272997, | |
| "grad_norm": 9.769325256347656, | |
| "kl": 0.08349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.788212239742279, | |
| "reward_std": 0.06093810824677348, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7986288219690323, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 145.8541717529297, | |
| "epoch": 1.8545994065281899, | |
| "grad_norm": 4.7226643562316895, | |
| "kl": 0.0816650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7469014525413513, | |
| "reward_std": 0.08418664801865816, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7573181390762329, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 141.0208396911621, | |
| "epoch": 1.86053412462908, | |
| "grad_norm": 4.053534030914307, | |
| "kl": 0.0819091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7462977170944214, | |
| "reward_std": 0.12578246276825666, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7775476276874542, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 143.83333587646484, | |
| "epoch": 1.8664688427299705, | |
| "grad_norm": 2.9752867221832275, | |
| "kl": 0.080322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.8229963779449463, | |
| "reward_std": 0.03319215914234519, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8229963183403015, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 142.21875381469727, | |
| "epoch": 1.8724035608308607, | |
| "grad_norm": 3.922870397567749, | |
| "kl": 0.0765380859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8421693444252014, | |
| "reward_std": 0.03140254644677043, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8421692997217178, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 138.50000762939453, | |
| "epoch": 1.8783382789317509, | |
| "grad_norm": 3.026204824447632, | |
| "kl": 0.0841064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.8107014894485474, | |
| "reward_std": 0.0752922969404608, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.810701459646225, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 135.03125381469727, | |
| "epoch": 1.884272997032641, | |
| "grad_norm": 18.734750747680664, | |
| "kl": 0.08935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7768707573413849, | |
| "reward_std": 0.06880995538085699, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7872874289751053, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 142.09375, | |
| "epoch": 1.8902077151335313, | |
| "grad_norm": 2.997725248336792, | |
| "kl": 0.087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7464922964572906, | |
| "reward_std": 0.13362758047878742, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7673256248235703, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 145.7916717529297, | |
| "epoch": 1.8961424332344214, | |
| "grad_norm": 7.73359489440918, | |
| "kl": 0.082763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7588372826576233, | |
| "reward_std": 0.07530860649421811, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7692539393901825, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 138.47917556762695, | |
| "epoch": 1.9020771513353116, | |
| "grad_norm": 5.083198070526123, | |
| "kl": 0.0914306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.72692009806633, | |
| "reward_std": 0.08305464556906372, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7477534264326096, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 142.41666793823242, | |
| "epoch": 1.9080118694362018, | |
| "grad_norm": 6.715488433837891, | |
| "kl": 0.0872802734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7511941194534302, | |
| "reward_std": 0.031710159964859486, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.751194104552269, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 138.04167556762695, | |
| "epoch": 1.913946587537092, | |
| "grad_norm": 2.5687637329101562, | |
| "kl": 0.0877685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8096555471420288, | |
| "reward_std": 0.03562284540385008, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.809655487537384, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 136.03125381469727, | |
| "epoch": 1.9198813056379822, | |
| "grad_norm": 5.063670635223389, | |
| "kl": 0.103759765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.699085146188736, | |
| "reward_std": 0.09667661227285862, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7095017433166504, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 135.0208396911621, | |
| "epoch": 1.9258160237388724, | |
| "grad_norm": 4.568605899810791, | |
| "kl": 0.0887451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8009403049945831, | |
| "reward_std": 0.04601938929408789, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8009403496980667, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 130.7604217529297, | |
| "epoch": 1.9317507418397626, | |
| "grad_norm": 4.746346473693848, | |
| "kl": 0.0987548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.7941046059131622, | |
| "reward_std": 0.0846591629087925, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8045211881399155, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 138.28125381469727, | |
| "epoch": 1.9376854599406528, | |
| "grad_norm": 12.027536392211914, | |
| "kl": 0.08740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.80451300740242, | |
| "reward_std": 0.03780581499449909, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8045129328966141, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 134.66666793823242, | |
| "epoch": 1.943620178041543, | |
| "grad_norm": 8.19904613494873, | |
| "kl": 0.0963134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.7921161651611328, | |
| "reward_std": 0.04007569560781121, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7921161502599716, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 135.46875762939453, | |
| "epoch": 1.9495548961424332, | |
| "grad_norm": 3.6576757431030273, | |
| "kl": 0.090576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7738652527332306, | |
| "reward_std": 0.04611685499548912, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7842819541692734, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 139.07291793823242, | |
| "epoch": 1.9554896142433233, | |
| "grad_norm": 5.115423679351807, | |
| "kl": 0.084228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.8082719147205353, | |
| "reward_std": 0.060723274014890194, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8186886012554169, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 133.8854217529297, | |
| "epoch": 1.9614243323442135, | |
| "grad_norm": 3.8776745796203613, | |
| "kl": 0.0994873046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.8066315650939941, | |
| "reward_std": 0.0820788680575788, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8170481771230698, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 136.3229217529297, | |
| "epoch": 1.9673590504451037, | |
| "grad_norm": 8.56767463684082, | |
| "kl": 0.08203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7753276526927948, | |
| "reward_std": 0.017031708965077996, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7753276079893112, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 131.2291717529297, | |
| "epoch": 1.973293768545994, | |
| "grad_norm": 3.104804277420044, | |
| "kl": 0.07958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7486878633499146, | |
| "reward_std": 0.060599199729040265, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7591045498847961, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 132.98958587646484, | |
| "epoch": 1.979228486646884, | |
| "grad_norm": 3.6994638442993164, | |
| "kl": 0.0992431640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "reward": 1.7889062762260437, | |
| "reward_std": 0.08577963337302208, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7993228882551193, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 132.62500190734863, | |
| "epoch": 1.9851632047477745, | |
| "grad_norm": 3.3083105087280273, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.7760635912418365, | |
| "reward_std": 0.044035853585228324, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7760635614395142, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 131.53125762939453, | |
| "epoch": 1.9910979228486647, | |
| "grad_norm": 4.454075336456299, | |
| "kl": 0.1187744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0047, | |
| "reward": 1.7569631934165955, | |
| "reward_std": 0.06494582071900368, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7569631636142731, | |
| "step": 336 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 336, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |