Seg-R1-7B / trainer_state.json
geshang's picture
init
cd72bcd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9910979228486647,
"eval_steps": 500,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 157.78125381469727,
"epoch": 0.005934718100890208,
"grad_norm": 4.843128681182861,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 1.0954087376594543,
"reward_std": 0.6969138085842133,
"rewards/format_reward": 0.6562500074505806,
"rewards/segmentation_reward": 0.43915872275829315,
"step": 1
},
{
"completion_length": 150.3854217529297,
"epoch": 0.011869436201780416,
"grad_norm": 5.3678154945373535,
"kl": 0.0009851455688476562,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 1.010141596198082,
"reward_std": 0.7214687466621399,
"rewards/format_reward": 0.6041666865348816,
"rewards/segmentation_reward": 0.40597493201494217,
"step": 2
},
{
"completion_length": 158.6354217529297,
"epoch": 0.017804154302670624,
"grad_norm": 7.694790363311768,
"kl": 0.001270294189453125,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.3760891556739807,
"reward_std": 0.4771263860166073,
"rewards/format_reward": 0.8020833432674408,
"rewards/segmentation_reward": 0.5740057602524757,
"step": 3
},
{
"completion_length": 148.0208396911621,
"epoch": 0.02373887240356083,
"grad_norm": 11.235418319702148,
"kl": 0.00232696533203125,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.367339700460434,
"reward_std": 0.42488569766283035,
"rewards/format_reward": 0.8437500149011612,
"rewards/segmentation_reward": 0.523589625954628,
"step": 4
},
{
"completion_length": 150.77083587646484,
"epoch": 0.02967359050445104,
"grad_norm": 6.50709342956543,
"kl": 0.00482177734375,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 1.173013836145401,
"reward_std": 0.744086429476738,
"rewards/format_reward": 0.6979166716337204,
"rewards/segmentation_reward": 0.4750971421599388,
"step": 5
},
{
"completion_length": 142.7291717529297,
"epoch": 0.03560830860534125,
"grad_norm": 4.435483932495117,
"kl": 0.00745391845703125,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 1.5586660504341125,
"reward_std": 0.1734358905814588,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.6003326624631882,
"step": 6
},
{
"completion_length": 138.25000762939453,
"epoch": 0.04154302670623145,
"grad_norm": 7.039760112762451,
"kl": 0.01007080078125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.5964379608631134,
"reward_std": 0.23111657053232193,
"rewards/format_reward": 0.9479167014360428,
"rewards/segmentation_reward": 0.648521289229393,
"step": 7
},
{
"completion_length": 144.59375381469727,
"epoch": 0.04747774480712166,
"grad_norm": 6.002971649169922,
"kl": 0.010711669921875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.500212699174881,
"reward_std": 0.26775410771369934,
"rewards/format_reward": 0.927083358168602,
"rewards/segmentation_reward": 0.573129341006279,
"step": 8
},
{
"completion_length": 136.31250381469727,
"epoch": 0.05341246290801187,
"grad_norm": 11.051139831542969,
"kl": 0.016204833984375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.4772345423698425,
"reward_std": 0.28671396523714066,
"rewards/format_reward": 0.8958333432674408,
"rewards/segmentation_reward": 0.5814011096954346,
"step": 9
},
{
"completion_length": 138.37500381469727,
"epoch": 0.05934718100890208,
"grad_norm": 5.32958459854126,
"kl": 0.0171356201171875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.6374868154525757,
"reward_std": 0.2423457931727171,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.6687368303537369,
"step": 10
},
{
"completion_length": 134.0312557220459,
"epoch": 0.06528189910979229,
"grad_norm": 12.082164764404297,
"kl": 0.0192108154296875,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.652137815952301,
"reward_std": 0.18039543880149722,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.6938043981790543,
"step": 11
},
{
"completion_length": 134.58333587646484,
"epoch": 0.0712166172106825,
"grad_norm": 6.0959930419921875,
"kl": 0.0180816650390625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.5255734622478485,
"reward_std": 0.21420218795537949,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.5568235069513321,
"step": 12
},
{
"completion_length": 132.81250762939453,
"epoch": 0.0771513353115727,
"grad_norm": 8.166247367858887,
"kl": 0.018310546875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.576871931552887,
"reward_std": 0.16248912550508976,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.6081219017505646,
"step": 13
},
{
"completion_length": 135.7604217529297,
"epoch": 0.0830860534124629,
"grad_norm": 6.298133850097656,
"kl": 0.02197265625,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.7197113037109375,
"reward_std": 0.07741504721343517,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7301279306411743,
"step": 14
},
{
"completion_length": 131.15625381469727,
"epoch": 0.08902077151335312,
"grad_norm": 15.750993728637695,
"kl": 0.023529052734375,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.6381132900714874,
"reward_std": 0.24310634471476078,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.6797799617052078,
"step": 15
},
{
"completion_length": 134.31250762939453,
"epoch": 0.09495548961424333,
"grad_norm": 14.33760929107666,
"kl": 0.03131103515625,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.707416594028473,
"reward_std": 0.20465393085032701,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7386665642261505,
"step": 16
},
{
"completion_length": 128.7812557220459,
"epoch": 0.10089020771513353,
"grad_norm": 9.760759353637695,
"kl": 0.02435302734375,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 1.6728956699371338,
"reward_std": 0.12627490423619747,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6833122968673706,
"step": 17
},
{
"completion_length": 128.16666984558105,
"epoch": 0.10682492581602374,
"grad_norm": 29.32834815979004,
"kl": 0.02447509765625,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 1.679076761007309,
"reward_std": 0.16491653956472874,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.6999100893735886,
"step": 18
},
{
"completion_length": 128.4479217529297,
"epoch": 0.11275964391691394,
"grad_norm": 8.054636001586914,
"kl": 0.026824951171875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.6828641295433044,
"reward_std": 0.05810322519391775,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.6828641593456268,
"step": 19
},
{
"completion_length": 131.4687557220459,
"epoch": 0.11869436201780416,
"grad_norm": 6.730808258056641,
"kl": 0.026153564453125,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 1.654236376285553,
"reward_std": 0.15527622308582067,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.6750697493553162,
"step": 20
},
{
"completion_length": 128.3229217529297,
"epoch": 0.12462908011869436,
"grad_norm": 7.039961814880371,
"kl": 0.0263671875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.70102459192276,
"reward_std": 0.07563944300636649,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7114411741495132,
"step": 21
},
{
"completion_length": 128.82292366027832,
"epoch": 0.13056379821958458,
"grad_norm": 25.049779891967773,
"kl": 0.033203125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.6903910338878632,
"reward_std": 0.11956312041729689,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7008076012134552,
"step": 22
},
{
"completion_length": 130.3541717529297,
"epoch": 0.13649851632047477,
"grad_norm": 22.480205535888672,
"kl": 0.028472900390625,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.7005047500133514,
"reward_std": 0.15591828245669603,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7317546755075455,
"step": 23
},
{
"completion_length": 125.06250190734863,
"epoch": 0.142433234421365,
"grad_norm": 7.072060585021973,
"kl": 0.044342041015625,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.7708145081996918,
"reward_std": 0.08357710530981421,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7812311798334122,
"step": 24
},
{
"completion_length": 121.11458587646484,
"epoch": 0.14836795252225518,
"grad_norm": 8.955461502075195,
"kl": 0.027099609375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.6748111546039581,
"reward_std": 0.12412907555699348,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6852278560400009,
"step": 25
},
{
"completion_length": 120.08333396911621,
"epoch": 0.1543026706231454,
"grad_norm": 6.5587687492370605,
"kl": 0.030548095703125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.6505275666713715,
"reward_std": 0.19929413869976997,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.6817775219678879,
"step": 26
},
{
"completion_length": 117.77083778381348,
"epoch": 0.16023738872403562,
"grad_norm": 7.269732475280762,
"kl": 0.0325927734375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.6844450235366821,
"reward_std": 0.10336442582774907,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6948617249727249,
"step": 27
},
{
"completion_length": 120.55208587646484,
"epoch": 0.1661721068249258,
"grad_norm": 5.853372097015381,
"kl": 0.026458740234375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.6804953813552856,
"reward_std": 0.12254019640386105,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7013287395238876,
"step": 28
},
{
"completion_length": 121.02083587646484,
"epoch": 0.17210682492581603,
"grad_norm": 31.43914031982422,
"kl": 0.0313720703125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.707067847251892,
"reward_std": 0.12043083645403385,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7070677876472473,
"step": 29
},
{
"completion_length": 122.77083587646484,
"epoch": 0.17804154302670624,
"grad_norm": 10.165179252624512,
"kl": 0.027313232421875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.7467810809612274,
"reward_std": 0.07829774497076869,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7571977376937866,
"step": 30
},
{
"completion_length": 121.52083587646484,
"epoch": 0.18397626112759644,
"grad_norm": 18.9634952545166,
"kl": 0.02899169921875,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.7623717486858368,
"reward_std": 0.06413916405290365,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7623717188835144,
"step": 31
},
{
"completion_length": 119.40625190734863,
"epoch": 0.18991097922848665,
"grad_norm": 9.280572891235352,
"kl": 0.111724853515625,
"learning_rate": 1e-06,
"loss": 0.0045,
"reward": 1.6756855249404907,
"reward_std": 0.17220945027656853,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.7173521369695663,
"step": 32
},
{
"completion_length": 120.37500190734863,
"epoch": 0.19584569732937684,
"grad_norm": 20.99109649658203,
"kl": 0.032470703125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.6820135414600372,
"reward_std": 0.09892075881361961,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7028468549251556,
"step": 33
},
{
"completion_length": 119.59375381469727,
"epoch": 0.20178041543026706,
"grad_norm": 11.153885841369629,
"kl": 0.0384521484375,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7420443892478943,
"reward_std": 0.05706456396728754,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7420443743467331,
"step": 34
},
{
"completion_length": 117.82291984558105,
"epoch": 0.20771513353115728,
"grad_norm": 7.118908882141113,
"kl": 0.03765869140625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7531715631484985,
"reward_std": 0.10411902144551277,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7635882049798965,
"step": 35
},
{
"completion_length": 120.62500381469727,
"epoch": 0.21364985163204747,
"grad_norm": 11.814937591552734,
"kl": 0.03887939453125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7418705523014069,
"reward_std": 0.07266789069399238,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7418705374002457,
"step": 36
},
{
"completion_length": 119.91666984558105,
"epoch": 0.2195845697329377,
"grad_norm": 6.670704364776611,
"kl": 0.03497314453125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.758776307106018,
"reward_std": 0.083824397996068,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7691929787397385,
"step": 37
},
{
"completion_length": 121.03125381469727,
"epoch": 0.22551928783382788,
"grad_norm": 7.1339545249938965,
"kl": 0.03955078125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.780372679233551,
"reward_std": 0.05653517507016659,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7803726345300674,
"step": 38
},
{
"completion_length": 123.71875190734863,
"epoch": 0.2314540059347181,
"grad_norm": 20.304597854614258,
"kl": 0.036163330078125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.7353099882602692,
"reward_std": 0.09488376975059509,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7457266598939896,
"step": 39
},
{
"completion_length": 126.91666793823242,
"epoch": 0.23738872403560832,
"grad_norm": 7.093606472015381,
"kl": 0.03582763671875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.730019450187683,
"reward_std": 0.061210392508655787,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7300194352865219,
"step": 40
},
{
"completion_length": 127.8125057220459,
"epoch": 0.2433234421364985,
"grad_norm": 11.748034477233887,
"kl": 0.038818359375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.693460375070572,
"reward_std": 0.10717929899692535,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7142936736345291,
"step": 41
},
{
"completion_length": 123.32291984558105,
"epoch": 0.24925816023738873,
"grad_norm": 10.225717544555664,
"kl": 0.04193115234375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.7418551743030548,
"reward_std": 0.15680904872715473,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7731050848960876,
"step": 42
},
{
"completion_length": 128.7083339691162,
"epoch": 0.2551928783382789,
"grad_norm": 7.125417232513428,
"kl": 0.0401611328125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.6099365949630737,
"reward_std": 0.23078683763742447,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.6516032218933105,
"step": 43
},
{
"completion_length": 127.89583778381348,
"epoch": 0.26112759643916916,
"grad_norm": 14.912107467651367,
"kl": 0.0399169921875,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.700283408164978,
"reward_std": 0.1383817931637168,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7315333336591721,
"step": 44
},
{
"completion_length": 134.50000381469727,
"epoch": 0.26706231454005935,
"grad_norm": 7.416093349456787,
"kl": 0.037109375,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7258837223052979,
"reward_std": 0.10230511240661144,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7363004088401794,
"step": 45
},
{
"completion_length": 130.80208587646484,
"epoch": 0.27299703264094954,
"grad_norm": 195.13722229003906,
"kl": 0.04193115234375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8153444528579712,
"reward_std": 0.07033979892730713,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8153444081544876,
"step": 46
},
{
"completion_length": 129.22916793823242,
"epoch": 0.2789317507418398,
"grad_norm": 5.549770832061768,
"kl": 0.04315185546875,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.7382583618164062,
"reward_std": 0.044012173311784863,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7382583022117615,
"step": 47
},
{
"completion_length": 135.3541717529297,
"epoch": 0.28486646884273,
"grad_norm": 13.713714599609375,
"kl": 0.0467529296875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.740307867527008,
"reward_std": 0.12180997617542744,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7611411660909653,
"step": 48
},
{
"completion_length": 131.77083587646484,
"epoch": 0.29080118694362017,
"grad_norm": 17.87116241455078,
"kl": 0.04638671875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.7088869214057922,
"reward_std": 0.07198232505470514,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7088869214057922,
"step": 49
},
{
"completion_length": 135.3333396911621,
"epoch": 0.29673590504451036,
"grad_norm": 14.480645179748535,
"kl": 0.04559326171875,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.7133915424346924,
"reward_std": 0.15073410887271166,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.7550581842660904,
"step": 50
},
{
"completion_length": 137.1041717529297,
"epoch": 0.3026706231454006,
"grad_norm": 5.64309024810791,
"kl": 0.03741455078125,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.6373025476932526,
"reward_std": 0.13259334303438663,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.6581359207630157,
"step": 51
},
{
"completion_length": 135.1979217529297,
"epoch": 0.3086053412462908,
"grad_norm": 5.212345123291016,
"kl": 0.037841796875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7314472496509552,
"reward_std": 0.04300686717033386,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7314473092556,
"step": 52
},
{
"completion_length": 134.28125381469727,
"epoch": 0.314540059347181,
"grad_norm": 6.159574508666992,
"kl": 0.04058837890625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7339943647384644,
"reward_std": 0.07974916975945234,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7444109320640564,
"step": 53
},
{
"completion_length": 139.8958396911621,
"epoch": 0.32047477744807124,
"grad_norm": 19.429752349853516,
"kl": 0.03790283203125,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7864000797271729,
"reward_std": 0.08986240020021796,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8072333931922913,
"step": 54
},
{
"completion_length": 140.21875762939453,
"epoch": 0.3264094955489614,
"grad_norm": 8.32386302947998,
"kl": 0.12933349609375,
"learning_rate": 1e-06,
"loss": 0.0052,
"reward": 1.7257477641105652,
"reward_std": 0.11480520572513342,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7465810775756836,
"step": 55
},
{
"completion_length": 140.37500381469727,
"epoch": 0.3323442136498516,
"grad_norm": 4.434051990509033,
"kl": 0.0369873046875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7016322612762451,
"reward_std": 0.14971541427075863,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7328822016716003,
"step": 56
},
{
"completion_length": 138.32291793823242,
"epoch": 0.33827893175074186,
"grad_norm": 21.184770584106445,
"kl": 0.03936767578125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7572259902954102,
"reward_std": 0.03327028127387166,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7572259604930878,
"step": 57
},
{
"completion_length": 144.4791717529297,
"epoch": 0.34421364985163205,
"grad_norm": 12.050070762634277,
"kl": 0.04034423828125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7147690653800964,
"reward_std": 0.09481704700738192,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7147690504789352,
"step": 58
},
{
"completion_length": 140.72916793823242,
"epoch": 0.35014836795252224,
"grad_norm": 17.42802619934082,
"kl": 0.041748046875,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.736547976732254,
"reward_std": 0.10654295142740011,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.746964618563652,
"step": 59
},
{
"completion_length": 140.9270896911621,
"epoch": 0.3560830860534125,
"grad_norm": 10.749812126159668,
"kl": 0.03387451171875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.753279983997345,
"reward_std": 0.040626129135489464,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7532799541950226,
"step": 60
},
{
"completion_length": 140.37500381469727,
"epoch": 0.3620178041543027,
"grad_norm": 7.617424488067627,
"kl": 0.035858154296875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.6448023915290833,
"reward_std": 0.1214896326418966,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.6656356900930405,
"step": 61
},
{
"completion_length": 138.68750381469727,
"epoch": 0.36795252225519287,
"grad_norm": 15.33764362335205,
"kl": 0.043731689453125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.7142232954502106,
"reward_std": 0.1497452650219202,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7454732358455658,
"step": 62
},
{
"completion_length": 143.78125762939453,
"epoch": 0.37388724035608306,
"grad_norm": 26.1903133392334,
"kl": 0.031951904296875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.717173308134079,
"reward_std": 0.1322586655151099,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7484232485294342,
"step": 63
},
{
"completion_length": 139.3645896911621,
"epoch": 0.3798219584569733,
"grad_norm": 11.163153648376465,
"kl": 0.0316162109375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7420830428600311,
"reward_std": 0.16791360033676028,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.7837497144937515,
"step": 64
},
{
"completion_length": 138.12500381469727,
"epoch": 0.3857566765578635,
"grad_norm": 8.011311531066895,
"kl": 0.03466796875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.6190190613269806,
"reward_std": 0.21082479134202003,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.6502691060304642,
"step": 65
},
{
"completion_length": 144.75000762939453,
"epoch": 0.3916913946587537,
"grad_norm": 11.111846923828125,
"kl": 0.03204345703125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.755936175584793,
"reward_std": 0.10981305036693811,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7663528323173523,
"step": 66
},
{
"completion_length": 139.93750762939453,
"epoch": 0.39762611275964393,
"grad_norm": 9.128460884094238,
"kl": 0.033660888671875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7998457551002502,
"reward_std": 0.06874012341722846,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7998457551002502,
"step": 67
},
{
"completion_length": 142.1979217529297,
"epoch": 0.4035608308605341,
"grad_norm": 12.227910041809082,
"kl": 0.032440185546875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7355614304542542,
"reward_std": 0.04059491120278835,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7355614304542542,
"step": 68
},
{
"completion_length": 147.52084350585938,
"epoch": 0.4094955489614243,
"grad_norm": 4.963237762451172,
"kl": 0.03948974609375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.6883772611618042,
"reward_std": 0.12697911448776722,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7092105895280838,
"step": 69
},
{
"completion_length": 146.0104217529297,
"epoch": 0.41543026706231456,
"grad_norm": 5.463217258453369,
"kl": 0.03167724609375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.655370056629181,
"reward_std": 0.1917936820536852,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.6970367729663849,
"step": 70
},
{
"completion_length": 144.03125381469727,
"epoch": 0.42136498516320475,
"grad_norm": 11.424163818359375,
"kl": 0.0374755859375,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.662711262702942,
"reward_std": 0.16645172238349915,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.6939611881971359,
"step": 71
},
{
"completion_length": 147.8333396911621,
"epoch": 0.42729970326409494,
"grad_norm": 19.122568130493164,
"kl": 0.030853271484375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.7026071846485138,
"reward_std": 0.12206890597008169,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.723440483212471,
"step": 72
},
{
"completion_length": 146.35417556762695,
"epoch": 0.4332344213649852,
"grad_norm": 5.500903129577637,
"kl": 0.03125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.6472920775413513,
"reward_std": 0.20085742231458426,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.6889587044715881,
"step": 73
},
{
"completion_length": 145.47916793823242,
"epoch": 0.4391691394658754,
"grad_norm": 8.684319496154785,
"kl": 0.035247802734375,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.7125734388828278,
"reward_std": 0.11136134760454297,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7334067076444626,
"step": 74
},
{
"completion_length": 147.15625381469727,
"epoch": 0.44510385756676557,
"grad_norm": 7.0105109214782715,
"kl": 0.02880859375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.690079689025879,
"reward_std": 0.06854211632162333,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.6900796890258789,
"step": 75
},
{
"completion_length": 144.68750762939453,
"epoch": 0.45103857566765576,
"grad_norm": 4.295221328735352,
"kl": 0.032958984375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7480990290641785,
"reward_std": 0.10679568164050579,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7585156410932541,
"step": 76
},
{
"completion_length": 137.6145896911621,
"epoch": 0.456973293768546,
"grad_norm": 9.906067848205566,
"kl": 0.0330810546875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7196555435657501,
"reward_std": 0.15833959798328578,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7613222450017929,
"step": 77
},
{
"completion_length": 140.1041717529297,
"epoch": 0.4629080118694362,
"grad_norm": 7.921298503875732,
"kl": 0.03240966796875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.7260190546512604,
"reward_std": 0.109945148229599,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7468523383140564,
"step": 78
},
{
"completion_length": 137.4166717529297,
"epoch": 0.4688427299703264,
"grad_norm": 13.661524772644043,
"kl": 0.02947998046875,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.721655547618866,
"reward_std": 0.15363138541579247,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7529055327177048,
"step": 79
},
{
"completion_length": 133.37500762939453,
"epoch": 0.47477744807121663,
"grad_norm": 7.67321252822876,
"kl": 0.0350341796875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.6211079955101013,
"reward_std": 0.2416740320622921,
"rewards/format_reward": 0.9479167014360428,
"rewards/segmentation_reward": 0.6731913536787033,
"step": 80
},
{
"completion_length": 133.2291717529297,
"epoch": 0.4807121661721068,
"grad_norm": 6.358465194702148,
"kl": 0.0377197265625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.6948546469211578,
"reward_std": 0.15342898294329643,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7261045873165131,
"step": 81
},
{
"completion_length": 137.3958396911621,
"epoch": 0.486646884272997,
"grad_norm": 5.652665138244629,
"kl": 0.03717041015625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.7105199992656708,
"reward_std": 0.05939330440014601,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7105200439691544,
"step": 82
},
{
"completion_length": 128.9687557220459,
"epoch": 0.49258160237388726,
"grad_norm": 9.61016845703125,
"kl": 0.0361328125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.6815986335277557,
"reward_std": 0.07865951140411198,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.692015215754509,
"step": 83
},
{
"completion_length": 132.33333778381348,
"epoch": 0.49851632047477745,
"grad_norm": 13.813359260559082,
"kl": 0.041015625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7599249482154846,
"reward_std": 0.1436138590797782,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7911749482154846,
"step": 84
},
{
"completion_length": 125.53125381469727,
"epoch": 0.5044510385756676,
"grad_norm": 8.654479026794434,
"kl": 0.0391845703125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.730294018983841,
"reward_std": 0.08125040959566832,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7407106310129166,
"step": 85
},
{
"completion_length": 125.79166984558105,
"epoch": 0.5103857566765578,
"grad_norm": 12.567873001098633,
"kl": 0.0428466796875,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.7137242257595062,
"reward_std": 0.12538791447877884,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7345575541257858,
"step": 86
},
{
"completion_length": 119.34375190734863,
"epoch": 0.516320474777448,
"grad_norm": 4.4666361808776855,
"kl": 0.04974365234375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.6612667441368103,
"reward_std": 0.16449622996151447,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.6925167888402939,
"step": 87
},
{
"completion_length": 120.00000381469727,
"epoch": 0.5222551928783383,
"grad_norm": 8.294538497924805,
"kl": 0.0579833984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.6651489436626434,
"reward_std": 0.15905625000596046,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7068156003952026,
"step": 88
},
{
"completion_length": 114.86458587646484,
"epoch": 0.5281899109792285,
"grad_norm": 4.889521598815918,
"kl": 0.056396484375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7740101218223572,
"reward_std": 0.08986124489456415,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7844266593456268,
"step": 89
},
{
"completion_length": 113.12500190734863,
"epoch": 0.5341246290801187,
"grad_norm": 13.152044296264648,
"kl": 0.06005859375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7618820667266846,
"reward_std": 0.1168306190520525,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7722987830638885,
"step": 90
},
{
"completion_length": 109.41666984558105,
"epoch": 0.5400593471810089,
"grad_norm": 41.23225402832031,
"kl": 0.062255859375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.7959917783737183,
"reward_std": 0.05520722921937704,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7959917634725571,
"step": 91
},
{
"completion_length": 112.48958587646484,
"epoch": 0.5459940652818991,
"grad_norm": 24.223176956176758,
"kl": 0.0667724609375,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7793012261390686,
"reward_std": 0.07624120265245438,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.789717823266983,
"step": 92
},
{
"completion_length": 108.58333587646484,
"epoch": 0.5519287833827893,
"grad_norm": 6.036355972290039,
"kl": 0.10546875,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.6717748641967773,
"reward_std": 0.1516607478260994,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.6926082074642181,
"step": 93
},
{
"completion_length": 104.37500381469727,
"epoch": 0.5578635014836796,
"grad_norm": 6.875241279602051,
"kl": 0.0736083984375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7788923680782318,
"reward_std": 0.07802953757345676,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7788923531770706,
"step": 94
},
{
"completion_length": 106.38541984558105,
"epoch": 0.5637982195845698,
"grad_norm": 9.138381958007812,
"kl": 0.080322265625,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7983123362064362,
"reward_std": 0.0496332747861743,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7983123362064362,
"step": 95
},
{
"completion_length": 106.48958778381348,
"epoch": 0.56973293768546,
"grad_norm": 19.429649353027344,
"kl": 0.084716796875,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.7282686233520508,
"reward_std": 0.1304482314735651,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.749101996421814,
"step": 96
},
{
"completion_length": 103.33333396911621,
"epoch": 0.5756676557863502,
"grad_norm": 6.47030782699585,
"kl": 0.086669921875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7975687980651855,
"reward_std": 0.046134506817907095,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7975687384605408,
"step": 97
},
{
"completion_length": 106.27083587646484,
"epoch": 0.5816023738872403,
"grad_norm": 19.016536712646484,
"kl": 0.09228515625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7813318371772766,
"reward_std": 0.08001636108383536,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.791748434305191,
"step": 98
},
{
"completion_length": 104.04166984558105,
"epoch": 0.5875370919881305,
"grad_norm": 7.989252090454102,
"kl": 0.087890625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8176788091659546,
"reward_std": 0.04656107863411307,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.817678764462471,
"step": 99
},
{
"completion_length": 104.83333587646484,
"epoch": 0.5934718100890207,
"grad_norm": 12.417603492736816,
"kl": 0.093994140625,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.756578117609024,
"reward_std": 0.09739597979933023,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7669947296380997,
"step": 100
},
{
"completion_length": 103.44791793823242,
"epoch": 0.599406528189911,
"grad_norm": 8.88759708404541,
"kl": 0.1048583984375,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.710509866476059,
"reward_std": 0.16302508860826492,
"rewards/format_reward": 0.9583333730697632,
"rewards/segmentation_reward": 0.7521764636039734,
"step": 101
},
{
"completion_length": 104.97916984558105,
"epoch": 0.6053412462908012,
"grad_norm": 6.247071266174316,
"kl": 0.0887451171875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.764405071735382,
"reward_std": 0.09363555815070868,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7748217582702637,
"step": 102
},
{
"completion_length": 103.29166793823242,
"epoch": 0.6112759643916914,
"grad_norm": 8.60831069946289,
"kl": 0.0771484375,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.7240653932094574,
"reward_std": 0.07296892208978534,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7240653038024902,
"step": 103
},
{
"completion_length": 104.32291984558105,
"epoch": 0.6172106824925816,
"grad_norm": 41.23006820678711,
"kl": 0.089599609375,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7711005508899689,
"reward_std": 0.10164707154035568,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7815171927213669,
"step": 104
},
{
"completion_length": 102.12500190734863,
"epoch": 0.6231454005934718,
"grad_norm": 10.650015830993652,
"kl": 0.0810546875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7078483700752258,
"reward_std": 0.047545977868139744,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.707848384976387,
"step": 105
},
{
"completion_length": 103.43750381469727,
"epoch": 0.629080118694362,
"grad_norm": 5.714875221252441,
"kl": 0.088623046875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7607176005840302,
"reward_std": 0.11030509509146214,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7815508991479874,
"step": 106
},
{
"completion_length": 102.39583587646484,
"epoch": 0.6350148367952523,
"grad_norm": 4.9711079597473145,
"kl": 0.0848388671875,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.7914873659610748,
"reward_std": 0.07838396297302097,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8019039928913116,
"step": 107
},
{
"completion_length": 105.53125190734863,
"epoch": 0.6409495548961425,
"grad_norm": 34.618621826171875,
"kl": 0.094970703125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7531647086143494,
"reward_std": 0.133857280947268,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7739980816841125,
"step": 108
},
{
"completion_length": 106.22917175292969,
"epoch": 0.6468842729970327,
"grad_norm": 65.69969177246094,
"kl": 0.076904296875,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8055002093315125,
"reward_std": 0.04653235850855708,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8055001646280289,
"step": 109
},
{
"completion_length": 104.59375190734863,
"epoch": 0.6528189910979229,
"grad_norm": 6.102004051208496,
"kl": 0.078857421875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7133222222328186,
"reward_std": 0.11231098510324955,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7341555207967758,
"step": 110
},
{
"completion_length": 109.19791793823242,
"epoch": 0.658753709198813,
"grad_norm": 10.22568130493164,
"kl": 0.07861328125,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.7694753110408783,
"reward_std": 0.08740894356742501,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7798919528722763,
"step": 111
},
{
"completion_length": 110.48958587646484,
"epoch": 0.6646884272997032,
"grad_norm": 14.369743347167969,
"kl": 0.077880859375,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.678814172744751,
"reward_std": 0.11626282706856728,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.6996474862098694,
"step": 112
},
{
"completion_length": 108.34375381469727,
"epoch": 0.6706231454005934,
"grad_norm": 9.375542640686035,
"kl": 0.0745849609375,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.76340052485466,
"reward_std": 0.06236946932040155,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7738171219825745,
"step": 113
},
{
"completion_length": 106.72916984558105,
"epoch": 0.6765578635014837,
"grad_norm": 11.679986000061035,
"kl": 0.0679931640625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7401450872421265,
"reward_std": 0.1390870539471507,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7713950872421265,
"step": 114
},
{
"completion_length": 112.61458587646484,
"epoch": 0.6824925816023739,
"grad_norm": 14.078929901123047,
"kl": 0.0660400390625,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8118026554584503,
"reward_std": 0.06046540685929358,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8222192972898483,
"step": 115
},
{
"completion_length": 114.12500190734863,
"epoch": 0.6884272997032641,
"grad_norm": 30.76274299621582,
"kl": 0.06036376953125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7482226490974426,
"reward_std": 0.0746369045227766,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7586392909288406,
"step": 116
},
{
"completion_length": 114.25000381469727,
"epoch": 0.6943620178041543,
"grad_norm": 5.525589466094971,
"kl": 0.06390380859375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.668542355298996,
"reward_std": 0.16833586525171995,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7102090418338776,
"step": 117
},
{
"completion_length": 115.56250381469727,
"epoch": 0.7002967359050445,
"grad_norm": 6.112927436828613,
"kl": 0.05987548828125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7961181998252869,
"reward_std": 0.10987653583288193,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8065348565578461,
"step": 118
},
{
"completion_length": 118.67708587646484,
"epoch": 0.7062314540059347,
"grad_norm": 5.205990314483643,
"kl": 0.0548095703125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8050501644611359,
"reward_std": 0.057498088805004954,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8154668360948563,
"step": 119
},
{
"completion_length": 126.40625381469727,
"epoch": 0.712166172106825,
"grad_norm": 17.208343505859375,
"kl": 0.0572509765625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7176477015018463,
"reward_std": 0.033727534115314484,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7176476120948792,
"step": 120
},
{
"completion_length": 125.34375381469727,
"epoch": 0.7181008902077152,
"grad_norm": 5.374625205993652,
"kl": 0.05877685546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7735000848770142,
"reward_std": 0.05824981536716223,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7839167714118958,
"step": 121
},
{
"completion_length": 127.64583587646484,
"epoch": 0.7240356083086054,
"grad_norm": 7.2726311683654785,
"kl": 0.052490234375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8391265571117401,
"reward_std": 0.04235434322617948,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8391265720129013,
"step": 122
},
{
"completion_length": 136.0416717529297,
"epoch": 0.7299703264094956,
"grad_norm": 3.9126789569854736,
"kl": 0.049560546875,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.7534107267856598,
"reward_std": 0.0715349493548274,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7638274282217026,
"step": 123
},
{
"completion_length": 132.17708778381348,
"epoch": 0.7359050445103857,
"grad_norm": 7.291192531585693,
"kl": 0.04913330078125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.6664856970310211,
"reward_std": 0.0711211496964097,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6769023388624191,
"step": 124
},
{
"completion_length": 135.2708396911621,
"epoch": 0.7418397626112759,
"grad_norm": 5.235753059387207,
"kl": 0.0531005859375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.666751205921173,
"reward_std": 0.22133541852235794,
"rewards/format_reward": 0.9479167014360428,
"rewards/segmentation_reward": 0.7188344746828079,
"step": 125
},
{
"completion_length": 137.87500762939453,
"epoch": 0.7477744807121661,
"grad_norm": 7.52689266204834,
"kl": 0.0528564453125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.717499554157257,
"reward_std": 0.08941709902137518,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7279161959886551,
"step": 126
},
{
"completion_length": 142.9583396911621,
"epoch": 0.7537091988130564,
"grad_norm": 6.447893142700195,
"kl": 0.05340576171875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.729353815317154,
"reward_std": 0.0784148364327848,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7397704720497131,
"step": 127
},
{
"completion_length": 138.9479217529297,
"epoch": 0.7596439169139466,
"grad_norm": 9.053107261657715,
"kl": 0.05517578125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7839249968528748,
"reward_std": 0.018866646569222212,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7839248925447464,
"step": 128
},
{
"completion_length": 139.73958587646484,
"epoch": 0.7655786350148368,
"grad_norm": 6.239522933959961,
"kl": 0.05584716796875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8248617351055145,
"reward_std": 0.047789576230570674,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8248616605997086,
"step": 129
},
{
"completion_length": 140.15625381469727,
"epoch": 0.771513353115727,
"grad_norm": 6.727787971496582,
"kl": 0.05694580078125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7491380870342255,
"reward_std": 0.0848769242875278,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7595547139644623,
"step": 130
},
{
"completion_length": 142.87500381469727,
"epoch": 0.7774480712166172,
"grad_norm": 10.61686897277832,
"kl": 0.0516357421875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.6897242963314056,
"reward_std": 0.11665517743676901,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7105576545000076,
"step": 131
},
{
"completion_length": 139.32291793823242,
"epoch": 0.7833827893175074,
"grad_norm": 26.17420196533203,
"kl": 0.05377197265625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7028740346431732,
"reward_std": 0.12365616485476494,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.723707303404808,
"step": 132
},
{
"completion_length": 141.1354217529297,
"epoch": 0.7893175074183977,
"grad_norm": 7.808278560638428,
"kl": 0.05169677734375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7467839121818542,
"reward_std": 0.06525697093456984,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7572005093097687,
"step": 133
},
{
"completion_length": 139.0729217529297,
"epoch": 0.7952522255192879,
"grad_norm": 15.984428405761719,
"kl": 0.06195068359375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.7193138897418976,
"reward_std": 0.08782277535647154,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7297305464744568,
"step": 134
},
{
"completion_length": 144.7604217529297,
"epoch": 0.8011869436201781,
"grad_norm": 11.879109382629395,
"kl": 0.052734375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7329791486263275,
"reward_std": 0.07974315900355577,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7433958202600479,
"step": 135
},
{
"completion_length": 138.1770896911621,
"epoch": 0.8071216617210683,
"grad_norm": 7.3762383460998535,
"kl": 0.0635986328125,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.6841251850128174,
"reward_std": 0.14460914488881826,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7153750509023666,
"step": 136
},
{
"completion_length": 136.5104217529297,
"epoch": 0.8130563798219584,
"grad_norm": 21.975542068481445,
"kl": 0.0584716796875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7298172414302826,
"reward_std": 0.11540778167545795,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7506504952907562,
"step": 137
},
{
"completion_length": 138.06250762939453,
"epoch": 0.8189910979228486,
"grad_norm": 9.58665657043457,
"kl": 0.05853271484375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7674833238124847,
"reward_std": 0.0737875527702272,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7779000103473663,
"step": 138
},
{
"completion_length": 136.50000381469727,
"epoch": 0.8249258160237388,
"grad_norm": 6.232303142547607,
"kl": 0.068603515625,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7411607801914215,
"reward_std": 0.11014922056347132,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7619940489530563,
"step": 139
},
{
"completion_length": 135.50000762939453,
"epoch": 0.8308605341246291,
"grad_norm": 5.448104381561279,
"kl": 0.0635986328125,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.7017558217048645,
"reward_std": 0.14623194839805365,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7330057322978973,
"step": 140
},
{
"completion_length": 130.5104217529297,
"epoch": 0.8367952522255193,
"grad_norm": 6.76865816116333,
"kl": 0.05865478515625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8449821770191193,
"reward_std": 0.026794791920110583,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8449821621179581,
"step": 141
},
{
"completion_length": 132.0208396911621,
"epoch": 0.8427299703264095,
"grad_norm": 28.80607032775879,
"kl": 0.05657958984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7313937842845917,
"reward_std": 0.17943856306374073,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7730603665113449,
"step": 142
},
{
"completion_length": 130.68750381469727,
"epoch": 0.8486646884272997,
"grad_norm": 6.0609564781188965,
"kl": 0.0596923828125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7204857766628265,
"reward_std": 0.13561286870390177,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7413192093372345,
"step": 143
},
{
"completion_length": 132.06250190734863,
"epoch": 0.8545994065281899,
"grad_norm": 12.343249320983887,
"kl": 0.06207275390625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.6887616515159607,
"reward_std": 0.09354105032980442,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6991782933473587,
"step": 144
},
{
"completion_length": 128.94791984558105,
"epoch": 0.8605341246290801,
"grad_norm": 14.069941520690918,
"kl": 0.064453125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.7474890351295471,
"reward_std": 0.13476973632350564,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7787390351295471,
"step": 145
},
{
"completion_length": 127.62500381469727,
"epoch": 0.8664688427299704,
"grad_norm": 10.8029146194458,
"kl": 0.0654296875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.7317570745944977,
"reward_std": 0.09545435523614287,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7525903880596161,
"step": 146
},
{
"completion_length": 130.9166717529297,
"epoch": 0.8724035608308606,
"grad_norm": 11.501479148864746,
"kl": 0.0904541015625,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.686740756034851,
"reward_std": 0.1634499505162239,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7179906815290451,
"step": 147
},
{
"completion_length": 126.8750057220459,
"epoch": 0.8783382789317508,
"grad_norm": 13.238967895507812,
"kl": 0.06903076171875,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7835940718650818,
"reward_std": 0.07068347651511431,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7940107583999634,
"step": 148
},
{
"completion_length": 128.73958778381348,
"epoch": 0.884272997032641,
"grad_norm": 12.029227256774902,
"kl": 0.0657958984375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.7471203207969666,
"reward_std": 0.051280025159940124,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7471202611923218,
"step": 149
},
{
"completion_length": 126.30208969116211,
"epoch": 0.8902077151335311,
"grad_norm": 10.597739219665527,
"kl": 0.05908203125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7907660007476807,
"reward_std": 0.095851581543684,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8011826276779175,
"step": 150
},
{
"completion_length": 130.15625762939453,
"epoch": 0.8961424332344213,
"grad_norm": 11.776272773742676,
"kl": 0.06475830078125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8018748760223389,
"reward_std": 0.06940475525334477,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8122915327548981,
"step": 151
},
{
"completion_length": 130.97916793823242,
"epoch": 0.9020771513353115,
"grad_norm": 13.906003952026367,
"kl": 0.0582275390625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8337860107421875,
"reward_std": 0.026843111030757427,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8337860107421875,
"step": 152
},
{
"completion_length": 130.58333778381348,
"epoch": 0.9080118694362018,
"grad_norm": 14.249563217163086,
"kl": 0.05963134765625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.749543309211731,
"reward_std": 0.02320151124149561,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7495433241128922,
"step": 153
},
{
"completion_length": 123.80208396911621,
"epoch": 0.913946587537092,
"grad_norm": 31.058822631835938,
"kl": 0.056396484375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7307706773281097,
"reward_std": 0.09688921645283699,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7411873042583466,
"step": 154
},
{
"completion_length": 127.3437557220459,
"epoch": 0.9198813056379822,
"grad_norm": 24.157621383666992,
"kl": 0.0574951171875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7422930300235748,
"reward_std": 0.05691177165135741,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.742293044924736,
"step": 155
},
{
"completion_length": 125.56250381469727,
"epoch": 0.9258160237388724,
"grad_norm": 8.896293640136719,
"kl": 0.06060791015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7800202369689941,
"reward_std": 0.10962340701371431,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8008535355329514,
"step": 156
},
{
"completion_length": 129.05208587646484,
"epoch": 0.9317507418397626,
"grad_norm": 9.590377807617188,
"kl": 0.060546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8012830018997192,
"reward_std": 0.017546723363921046,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8012829422950745,
"step": 157
},
{
"completion_length": 126.30208396911621,
"epoch": 0.9376854599406528,
"grad_norm": 8.54839038848877,
"kl": 0.05804443359375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.6956645548343658,
"reward_std": 0.04086484480649233,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.6956644654273987,
"step": 158
},
{
"completion_length": 128.5104217529297,
"epoch": 0.9436201780415431,
"grad_norm": 6.301590442657471,
"kl": 0.05426025390625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7389605939388275,
"reward_std": 0.11934427171945572,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7493772804737091,
"step": 159
},
{
"completion_length": 126.62500381469727,
"epoch": 0.9495548961424333,
"grad_norm": 6.094778537750244,
"kl": 0.05352783203125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7299975156784058,
"reward_std": 0.05153268342837691,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7299974858760834,
"step": 160
},
{
"completion_length": 126.44792175292969,
"epoch": 0.9554896142433235,
"grad_norm": 16.101211547851562,
"kl": 0.05364990234375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7796873152256012,
"reward_std": 0.10896322131156921,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8005205690860748,
"step": 161
},
{
"completion_length": 127.29167366027832,
"epoch": 0.9614243323442137,
"grad_norm": 9.07394790649414,
"kl": 0.05267333984375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.806904137134552,
"reward_std": 0.03972620144486427,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8069040328264236,
"step": 162
},
{
"completion_length": 124.92708778381348,
"epoch": 0.9673590504451038,
"grad_norm": 4.550307750701904,
"kl": 0.05987548828125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7452739477157593,
"reward_std": 0.10492927418090403,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7765238583087921,
"step": 163
},
{
"completion_length": 125.27083587646484,
"epoch": 0.973293768545994,
"grad_norm": 7.808937072753906,
"kl": 0.05322265625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7517999708652496,
"reward_std": 0.09145255433395505,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7622165530920029,
"step": 164
},
{
"completion_length": 125.53125381469727,
"epoch": 0.9792284866468842,
"grad_norm": 4.795185565948486,
"kl": 0.05657958984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7628948986530304,
"reward_std": 0.05225597298704088,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.773311510682106,
"step": 165
},
{
"completion_length": 126.81250381469727,
"epoch": 0.9851632047477745,
"grad_norm": 212.8815460205078,
"kl": 0.05267333984375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7152629494667053,
"reward_std": 0.0905265836045146,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7256796211004257,
"step": 166
},
{
"completion_length": 122.66666793823242,
"epoch": 0.9910979228486647,
"grad_norm": 13.994025230407715,
"kl": 0.05609130859375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.82010880112648,
"reward_std": 0.04186421073973179,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8201088160276413,
"step": 167
},
{
"completion_length": 116.84375190734863,
"epoch": 0.9970326409495549,
"grad_norm": 6.674865245819092,
"kl": 0.05755615234375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8161313235759735,
"reward_std": 0.020860509714111686,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8161313384771347,
"step": 168
},
{
"completion_length": 122.33333587646484,
"epoch": 1.0,
"grad_norm": 6.674865245819092,
"kl": 0.05810546875,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8273937106132507,
"reward_std": 0.05868770182132721,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8482270240783691,
"step": 169
},
{
"completion_length": 120.18750190734863,
"epoch": 1.0059347181008902,
"grad_norm": 4.1788835525512695,
"kl": 0.0687255859375,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.6861424446105957,
"reward_std": 0.07579059433192015,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6965591013431549,
"step": 170
},
{
"completion_length": 121.20833587646484,
"epoch": 1.0118694362017804,
"grad_norm": 8.323039054870605,
"kl": 0.06103515625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8149082660675049,
"reward_std": 0.06312395888380706,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8253249377012253,
"step": 171
},
{
"completion_length": 120.20833587646484,
"epoch": 1.0178041543026706,
"grad_norm": 4.517621040344238,
"kl": 0.066650390625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.792221575975418,
"reward_std": 0.021815289743244648,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7922215461730957,
"step": 172
},
{
"completion_length": 119.87500190734863,
"epoch": 1.0237388724035608,
"grad_norm": 9.510743141174316,
"kl": 0.05963134765625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8078531324863434,
"reward_std": 0.07382986601442099,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8182697743177414,
"step": 173
},
{
"completion_length": 119.87500190734863,
"epoch": 1.029673590504451,
"grad_norm": 5.854668140411377,
"kl": 0.07275390625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8211482167243958,
"reward_std": 0.08701172703877091,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8315648138523102,
"step": 174
},
{
"completion_length": 115.69791984558105,
"epoch": 1.0356083086053411,
"grad_norm": 3.5263679027557373,
"kl": 0.068603515625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7708676159381866,
"reward_std": 0.07291271979920566,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7812842577695847,
"step": 175
},
{
"completion_length": 119.44791984558105,
"epoch": 1.0415430267062316,
"grad_norm": 9.571796417236328,
"kl": 0.06640625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7062998712062836,
"reward_std": 0.03168163984082639,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7062998414039612,
"step": 176
},
{
"completion_length": 115.05208587646484,
"epoch": 1.0474777448071217,
"grad_norm": 3.939903974533081,
"kl": 0.07196044921875,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.6977596580982208,
"reward_std": 0.0435628320556134,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.6977596431970596,
"step": 177
},
{
"completion_length": 120.00000381469727,
"epoch": 1.053412462908012,
"grad_norm": 5.052703380584717,
"kl": 0.072265625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7161425948143005,
"reward_std": 0.08243446378037333,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.726559191942215,
"step": 178
},
{
"completion_length": 114.14583778381348,
"epoch": 1.0593471810089021,
"grad_norm": 3.9155845642089844,
"kl": 0.07421875,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.7838780879974365,
"reward_std": 0.05022241431288421,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.794294685125351,
"step": 179
},
{
"completion_length": 113.77083396911621,
"epoch": 1.0652818991097923,
"grad_norm": 6.6712727546691895,
"kl": 0.0728759765625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7887768745422363,
"reward_std": 0.03889981145039201,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7887768745422363,
"step": 180
},
{
"completion_length": 115.16666793823242,
"epoch": 1.0712166172106825,
"grad_norm": 8.987320899963379,
"kl": 0.068359375,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7815601825714111,
"reward_std": 0.045000725833233446,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7815601229667664,
"step": 181
},
{
"completion_length": 115.10416793823242,
"epoch": 1.0771513353115727,
"grad_norm": 5.333348274230957,
"kl": 0.08056640625,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.807422399520874,
"reward_std": 0.06547991407569498,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8178391009569168,
"step": 182
},
{
"completion_length": 113.58333587646484,
"epoch": 1.083086053412463,
"grad_norm": 14.877934455871582,
"kl": 0.0751953125,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.7589649856090546,
"reward_std": 0.036034910939633846,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7589650005102158,
"step": 183
},
{
"completion_length": 115.14583587646484,
"epoch": 1.089020771513353,
"grad_norm": 5.486746311187744,
"kl": 0.078369140625,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.692201405763626,
"reward_std": 0.06457182578742504,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.6922013610601425,
"step": 184
},
{
"completion_length": 116.83333396911621,
"epoch": 1.0949554896142433,
"grad_norm": 22.6247501373291,
"kl": 0.069580078125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7884438037872314,
"reward_std": 0.0695024150190875,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7988604456186295,
"step": 185
},
{
"completion_length": 114.22916984558105,
"epoch": 1.1008902077151335,
"grad_norm": 3.4625394344329834,
"kl": 0.06756591796875,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7949866950511932,
"reward_std": 0.0218111855792813,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7949866652488708,
"step": 186
},
{
"completion_length": 114.61458587646484,
"epoch": 1.1068249258160237,
"grad_norm": 3.797193765640259,
"kl": 0.07470703125,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.7465449571609497,
"reward_std": 0.06478537991642952,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7569615691900253,
"step": 187
},
{
"completion_length": 113.55208587646484,
"epoch": 1.1127596439169138,
"grad_norm": 3.3276095390319824,
"kl": 0.091552734375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7474755942821503,
"reward_std": 0.07169347535818815,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.757892295718193,
"step": 188
},
{
"completion_length": 115.68750381469727,
"epoch": 1.1186943620178043,
"grad_norm": 5.974353790283203,
"kl": 0.079345703125,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.8201344907283783,
"reward_std": 0.04725779825821519,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8305511772632599,
"step": 189
},
{
"completion_length": 117.37500190734863,
"epoch": 1.1246290801186944,
"grad_norm": 16.502485275268555,
"kl": 0.077880859375,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8688680529594421,
"reward_std": 0.01468480727635324,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8688680231571198,
"step": 190
},
{
"completion_length": 114.40625190734863,
"epoch": 1.1305637982195846,
"grad_norm": 5.832488059997559,
"kl": 0.082275390625,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7779072523117065,
"reward_std": 0.10380933433771133,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7987405955791473,
"step": 191
},
{
"completion_length": 113.42708587646484,
"epoch": 1.1364985163204748,
"grad_norm": 11.94194507598877,
"kl": 0.0953369140625,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7498697340488434,
"reward_std": 0.06882704934105277,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.760286420583725,
"step": 192
},
{
"completion_length": 114.39583778381348,
"epoch": 1.142433234421365,
"grad_norm": 3.888754367828369,
"kl": 0.0953369140625,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7453253865242004,
"reward_std": 0.10540217161178589,
"rewards/format_reward": 0.96875,
"rewards/segmentation_reward": 0.776575356721878,
"step": 193
},
{
"completion_length": 119.65625381469727,
"epoch": 1.1483679525222552,
"grad_norm": 8.40555477142334,
"kl": 0.0889892578125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7195344865322113,
"reward_std": 0.0910744748543948,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7299510538578033,
"step": 194
},
{
"completion_length": 117.95833396911621,
"epoch": 1.1543026706231454,
"grad_norm": 14.257683753967285,
"kl": 0.0919189453125,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7635149657726288,
"reward_std": 0.049157430883497,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7635148763656616,
"step": 195
},
{
"completion_length": 118.45833587646484,
"epoch": 1.1602373887240356,
"grad_norm": 4.263291835784912,
"kl": 0.08837890625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.6939684748649597,
"reward_std": 0.08351217093877494,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7148017585277557,
"step": 196
},
{
"completion_length": 114.95833778381348,
"epoch": 1.1661721068249258,
"grad_norm": 4.633386611938477,
"kl": 0.0999755859375,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.8257586061954498,
"reward_std": 0.03723354451358318,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.825758546590805,
"step": 197
},
{
"completion_length": 115.04166793823242,
"epoch": 1.172106824925816,
"grad_norm": 5.097075939178467,
"kl": 0.09375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7617026567459106,
"reward_std": 0.12132613873109221,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7825359106063843,
"step": 198
},
{
"completion_length": 116.18750381469727,
"epoch": 1.1780415430267062,
"grad_norm": 3.8244075775146484,
"kl": 0.096435546875,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7776153683662415,
"reward_std": 0.11768396757543087,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7984486818313599,
"step": 199
},
{
"completion_length": 118.48958587646484,
"epoch": 1.1839762611275964,
"grad_norm": 3.69380521774292,
"kl": 0.0916748046875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7464786171913147,
"reward_std": 0.12091443943791091,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7777286469936371,
"step": 200
},
{
"completion_length": 120.65625190734863,
"epoch": 1.1899109792284865,
"grad_norm": 17.07198715209961,
"kl": 0.11572265625,
"learning_rate": 1e-06,
"loss": 0.0046,
"reward": 1.7191323935985565,
"reward_std": 0.07988837361335754,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7295490056276321,
"step": 201
},
{
"completion_length": 124.06250190734863,
"epoch": 1.1958456973293767,
"grad_norm": 3.0627005100250244,
"kl": 0.0885009765625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.70257368683815,
"reward_std": 0.15230464632622898,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.73382368683815,
"step": 202
},
{
"completion_length": 118.03125,
"epoch": 1.2017804154302671,
"grad_norm": 4.091593265533447,
"kl": 0.09326171875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7475507259368896,
"reward_std": 0.09758387203328311,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7683840394020081,
"step": 203
},
{
"completion_length": 120.31250190734863,
"epoch": 1.2077151335311573,
"grad_norm": 2.6315650939941406,
"kl": 0.0948486328125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.778723031282425,
"reward_std": 0.06321048270910978,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7891396880149841,
"step": 204
},
{
"completion_length": 117.28125,
"epoch": 1.2136498516320475,
"grad_norm": 5.009098052978516,
"kl": 0.093505859375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7251938581466675,
"reward_std": 0.1409313241019845,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7564438581466675,
"step": 205
},
{
"completion_length": 115.86458587646484,
"epoch": 1.2195845697329377,
"grad_norm": 8.25062084197998,
"kl": 0.0926513671875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.674118846654892,
"reward_std": 0.17346674762666225,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7157854735851288,
"step": 206
},
{
"completion_length": 117.28125190734863,
"epoch": 1.225519287833828,
"grad_norm": 4.700351715087891,
"kl": 0.09814453125,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.642125815153122,
"reward_std": 0.23734556511044502,
"rewards/format_reward": 0.9375000298023224,
"rewards/segmentation_reward": 0.7046257257461548,
"step": 207
},
{
"completion_length": 117.13541984558105,
"epoch": 1.231454005934718,
"grad_norm": 10.711128234863281,
"kl": 0.091552734375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7565841972827911,
"reward_std": 0.11169130681082606,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7774175554513931,
"step": 208
},
{
"completion_length": 116.43750190734863,
"epoch": 1.2373887240356083,
"grad_norm": 5.590770244598389,
"kl": 0.0892333984375,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7059676349163055,
"reward_std": 0.1644108621403575,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7372175455093384,
"step": 209
},
{
"completion_length": 117.04166984558105,
"epoch": 1.2433234421364985,
"grad_norm": 5.0753092765808105,
"kl": 0.0899658203125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7945037186145782,
"reward_std": 0.09403454745188355,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8153369724750519,
"step": 210
},
{
"completion_length": 115.83333396911621,
"epoch": 1.2492581602373887,
"grad_norm": 6.409451007843018,
"kl": 0.09228515625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.731789082288742,
"reward_std": 0.15776935871690512,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7630390673875809,
"step": 211
},
{
"completion_length": 114.11458587646484,
"epoch": 1.2551928783382789,
"grad_norm": 4.285139083862305,
"kl": 0.0980224609375,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7591657638549805,
"reward_std": 0.14351706253364682,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7904157638549805,
"step": 212
},
{
"completion_length": 112.38541984558105,
"epoch": 1.2611275964391693,
"grad_norm": 10.488381385803223,
"kl": 0.1031494140625,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.7158048748970032,
"reward_std": 0.07641064748167992,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.726221576333046,
"step": 213
},
{
"completion_length": 115.22916793823242,
"epoch": 1.2670623145400595,
"grad_norm": 4.701923370361328,
"kl": 0.0870361328125,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.783218115568161,
"reward_std": 0.04100660281255841,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7832181453704834,
"step": 214
},
{
"completion_length": 119.78125190734863,
"epoch": 1.2729970326409497,
"grad_norm": 4.9908766746521,
"kl": 0.0970458984375,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.752919316291809,
"reward_std": 0.15398756321519613,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7737526744604111,
"step": 215
},
{
"completion_length": 116.95833587646484,
"epoch": 1.2789317507418398,
"grad_norm": 13.415197372436523,
"kl": 0.090087890625,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.6532210111618042,
"reward_std": 0.2240110612474382,
"rewards/format_reward": 0.9479166865348816,
"rewards/segmentation_reward": 0.705304279923439,
"step": 216
},
{
"completion_length": 119.66666984558105,
"epoch": 1.28486646884273,
"grad_norm": 2.535939931869507,
"kl": 0.09765625,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7339254915714264,
"reward_std": 0.13148222491145134,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.765175461769104,
"step": 217
},
{
"completion_length": 118.87500190734863,
"epoch": 1.2908011869436202,
"grad_norm": 4.534719944000244,
"kl": 0.1004638671875,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.8009094595909119,
"reward_std": 0.07824655435979366,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8113261461257935,
"step": 218
},
{
"completion_length": 120.03125381469727,
"epoch": 1.2967359050445104,
"grad_norm": 9.774252891540527,
"kl": 0.100830078125,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.761326789855957,
"reward_std": 0.04916087444871664,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7613267600536346,
"step": 219
},
{
"completion_length": 114.40625,
"epoch": 1.3026706231454006,
"grad_norm": 3.3857150077819824,
"kl": 0.094970703125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7107034027576447,
"reward_std": 0.12496633175760508,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7315367162227631,
"step": 220
},
{
"completion_length": 118.66666793823242,
"epoch": 1.3086053412462908,
"grad_norm": 3.008610725402832,
"kl": 0.0926513671875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.751450091600418,
"reward_std": 0.06824500812217593,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7618667632341385,
"step": 221
},
{
"completion_length": 119.70833587646484,
"epoch": 1.314540059347181,
"grad_norm": 3.0898277759552,
"kl": 0.08740234375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7804038226604462,
"reward_std": 0.05429700808599591,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7908205091953278,
"step": 222
},
{
"completion_length": 114.90625381469727,
"epoch": 1.3204747774480712,
"grad_norm": 3.706261396408081,
"kl": 0.0892333984375,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7740592658519745,
"reward_std": 0.04056970216333866,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7740592062473297,
"step": 223
},
{
"completion_length": 117.84375190734863,
"epoch": 1.3264094955489614,
"grad_norm": 6.03821325302124,
"kl": 0.1055908203125,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.7339475452899933,
"reward_std": 0.053982728626579046,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7443641424179077,
"step": 224
},
{
"completion_length": 120.10416984558105,
"epoch": 1.3323442136498516,
"grad_norm": 7.86570930480957,
"kl": 0.081787109375,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7618150115013123,
"reward_std": 0.11194364842958748,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7826483100652695,
"step": 225
},
{
"completion_length": 119.80208778381348,
"epoch": 1.3382789317507418,
"grad_norm": 5.573207378387451,
"kl": 0.089111328125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7204334437847137,
"reward_std": 0.14075168408453465,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7516833990812302,
"step": 226
},
{
"completion_length": 118.75000190734863,
"epoch": 1.344213649851632,
"grad_norm": 2.9999611377716064,
"kl": 0.093017578125,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7968370914459229,
"reward_std": 0.05921215028502047,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8072537481784821,
"step": 227
},
{
"completion_length": 127.47916984558105,
"epoch": 1.3501483679525221,
"grad_norm": 3.0452966690063477,
"kl": 0.0789794921875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7333484888076782,
"reward_std": 0.13363408669829369,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7645984143018723,
"step": 228
},
{
"completion_length": 118.04166984558105,
"epoch": 1.3560830860534125,
"grad_norm": 19.453765869140625,
"kl": 0.0914306640625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7236287295818329,
"reward_std": 0.13475321233272552,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7548786997795105,
"step": 229
},
{
"completion_length": 125.07292175292969,
"epoch": 1.3620178041543027,
"grad_norm": 3.446118116378784,
"kl": 0.0867919921875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7927662432193756,
"reward_std": 0.050292326137423515,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7927661687135696,
"step": 230
},
{
"completion_length": 122.11458778381348,
"epoch": 1.367952522255193,
"grad_norm": 3.5459084510803223,
"kl": 0.0965576171875,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7196455001831055,
"reward_std": 0.10603441158309579,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7300621271133423,
"step": 231
},
{
"completion_length": 124.71875381469727,
"epoch": 1.3738872403560831,
"grad_norm": 2.9705264568328857,
"kl": 0.0799560546875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.831710696220398,
"reward_std": 0.03920856770128012,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.831710696220398,
"step": 232
},
{
"completion_length": 121.53125190734863,
"epoch": 1.3798219584569733,
"grad_norm": 3.8888819217681885,
"kl": 0.07861328125,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.816581815481186,
"reward_std": 0.03990558721125126,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8165817707777023,
"step": 233
},
{
"completion_length": 123.67708587646484,
"epoch": 1.3857566765578635,
"grad_norm": 4.347487449645996,
"kl": 0.0831298828125,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7707000076770782,
"reward_std": 0.10984114836901426,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7915334403514862,
"step": 234
},
{
"completion_length": 122.53125190734863,
"epoch": 1.3916913946587537,
"grad_norm": 3.111170768737793,
"kl": 0.08544921875,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.776373028755188,
"reward_std": 0.10835065133869648,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7972063720226288,
"step": 235
},
{
"completion_length": 127.32291793823242,
"epoch": 1.3976261127596439,
"grad_norm": 15.74927806854248,
"kl": 0.078857421875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.8090568482875824,
"reward_std": 0.02911346103064716,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8090568333864212,
"step": 236
},
{
"completion_length": 128.96875381469727,
"epoch": 1.403560830860534,
"grad_norm": 3.8220973014831543,
"kl": 0.084228515625,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.6805144250392914,
"reward_std": 0.05462493887171149,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.680514469742775,
"step": 237
},
{
"completion_length": 123.5625057220459,
"epoch": 1.4094955489614243,
"grad_norm": 7.271127700805664,
"kl": 0.081298828125,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.8020037412643433,
"reward_std": 0.07482604053802788,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8124203979969025,
"step": 238
},
{
"completion_length": 127.60417175292969,
"epoch": 1.4154302670623147,
"grad_norm": 3.528641700744629,
"kl": 0.0869140625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7796452343463898,
"reward_std": 0.09994546975940466,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8004785478115082,
"step": 239
},
{
"completion_length": 132.9895896911621,
"epoch": 1.4213649851632049,
"grad_norm": 4.905125617980957,
"kl": 0.0906982421875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7627727091312408,
"reward_std": 0.0973970009945333,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7836060523986816,
"step": 240
},
{
"completion_length": 131.90625381469727,
"epoch": 1.427299703264095,
"grad_norm": 4.065957546234131,
"kl": 0.0736083984375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7551555633544922,
"reward_std": 0.0871009798720479,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7655721604824066,
"step": 241
},
{
"completion_length": 127.75000381469727,
"epoch": 1.4332344213649852,
"grad_norm": 9.198005676269531,
"kl": 0.0819091796875,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7510286271572113,
"reward_std": 0.0416345689445734,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7510285973548889,
"step": 242
},
{
"completion_length": 134.58333778381348,
"epoch": 1.4391691394658754,
"grad_norm": 2.298591375350952,
"kl": 0.0924072265625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7603932321071625,
"reward_std": 0.08814567420631647,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7708099633455276,
"step": 243
},
{
"completion_length": 133.47916984558105,
"epoch": 1.4451038575667656,
"grad_norm": 3.710606336593628,
"kl": 0.08740234375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7705409824848175,
"reward_std": 0.10231837723404169,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7809576243162155,
"step": 244
},
{
"completion_length": 132.7395896911621,
"epoch": 1.4510385756676558,
"grad_norm": 9.152044296264648,
"kl": 0.0809326171875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7908837795257568,
"reward_std": 0.06551890983246267,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8013004064559937,
"step": 245
},
{
"completion_length": 131.01041793823242,
"epoch": 1.456973293768546,
"grad_norm": 8.228063583374023,
"kl": 0.0845947265625,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.7855284810066223,
"reward_std": 0.025126937543973327,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7855284363031387,
"step": 246
},
{
"completion_length": 137.91666793823242,
"epoch": 1.4629080118694362,
"grad_norm": 6.479981422424316,
"kl": 0.15380859375,
"learning_rate": 1e-06,
"loss": 0.0061,
"reward": 1.7316823601722717,
"reward_std": 0.09728977642953396,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7420989274978638,
"step": 247
},
{
"completion_length": 129.26042366027832,
"epoch": 1.4688427299703264,
"grad_norm": 24.555652618408203,
"kl": 0.0960693359375,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7869995534420013,
"reward_std": 0.03961431025527418,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7869995981454849,
"step": 248
},
{
"completion_length": 129.4583396911621,
"epoch": 1.4747774480712166,
"grad_norm": 3.8002452850341797,
"kl": 0.095458984375,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.8435184359550476,
"reward_std": 0.028940949589014053,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8435183763504028,
"step": 249
},
{
"completion_length": 131.2291717529297,
"epoch": 1.4807121661721068,
"grad_norm": 5.852691650390625,
"kl": 0.17578125,
"learning_rate": 1e-06,
"loss": 0.007,
"reward": 1.7853436172008514,
"reward_std": 0.08121407218277454,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7957603484392166,
"step": 250
},
{
"completion_length": 133.0729217529297,
"epoch": 1.486646884272997,
"grad_norm": 4.016416072845459,
"kl": 0.0858154296875,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.6717259883880615,
"reward_std": 0.15333154564723372,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7029759734869003,
"step": 251
},
{
"completion_length": 130.0416717529297,
"epoch": 1.4925816023738872,
"grad_norm": 2.7940571308135986,
"kl": 0.0894775390625,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7898103296756744,
"reward_std": 0.11414735415019095,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8106437027454376,
"step": 252
},
{
"completion_length": 135.3333396911621,
"epoch": 1.4985163204747773,
"grad_norm": 6.82745885848999,
"kl": 0.088134765625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8346801698207855,
"reward_std": 0.02834852272644639,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8346801251173019,
"step": 253
},
{
"completion_length": 127.35417175292969,
"epoch": 1.5044510385756675,
"grad_norm": 2.9223759174346924,
"kl": 0.0977783203125,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.8006429970264435,
"reward_std": 0.11039311997592449,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8214763551950455,
"step": 254
},
{
"completion_length": 128.17708587646484,
"epoch": 1.5103857566765577,
"grad_norm": 3.724637269973755,
"kl": 0.1004638671875,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7425091862678528,
"reward_std": 0.08927370049059391,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.752925843000412,
"step": 255
},
{
"completion_length": 128.7708396911621,
"epoch": 1.516320474777448,
"grad_norm": 4.042162895202637,
"kl": 0.092529296875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7844707369804382,
"reward_std": 0.057451182045042515,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7844707667827606,
"step": 256
},
{
"completion_length": 125.92708587646484,
"epoch": 1.5222551928783383,
"grad_norm": 8.26577377319336,
"kl": 0.0897216796875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.8131219148635864,
"reward_std": 0.021738199284300208,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8131218403577805,
"step": 257
},
{
"completion_length": 125.91667175292969,
"epoch": 1.5281899109792285,
"grad_norm": 3.753349781036377,
"kl": 0.0914306640625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7764064967632294,
"reward_std": 0.06681416090577841,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7868231534957886,
"step": 258
},
{
"completion_length": 125.88542175292969,
"epoch": 1.5341246290801187,
"grad_norm": 3.156595230102539,
"kl": 0.10546875,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.7478066980838776,
"reward_std": 0.1068794084712863,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7686399817466736,
"step": 259
},
{
"completion_length": 121.90625190734863,
"epoch": 1.540059347181009,
"grad_norm": 3.946298360824585,
"kl": 0.097412109375,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7763938307762146,
"reward_std": 0.12856396986171603,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8076437562704086,
"step": 260
},
{
"completion_length": 123.72917175292969,
"epoch": 1.545994065281899,
"grad_norm": 5.4555277824401855,
"kl": 0.097900390625,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7746759355068207,
"reward_std": 0.05479801073670387,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7746759802103043,
"step": 261
},
{
"completion_length": 120.48958396911621,
"epoch": 1.5519287833827893,
"grad_norm": 4.6337199211120605,
"kl": 0.1065673828125,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 1.7088752686977386,
"reward_std": 0.08904288220219314,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7192919701337814,
"step": 262
},
{
"completion_length": 117.96875381469727,
"epoch": 1.5578635014836797,
"grad_norm": 97.80472564697266,
"kl": 0.098876953125,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.8269087076187134,
"reward_std": 0.04217356303706765,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8269086629152298,
"step": 263
},
{
"completion_length": 115.72916984558105,
"epoch": 1.5637982195845699,
"grad_norm": 4.1004486083984375,
"kl": 0.102294921875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.8442316353321075,
"reward_std": 0.04190053790807724,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8442316502332687,
"step": 264
},
{
"completion_length": 117.45833587646484,
"epoch": 1.56973293768546,
"grad_norm": 26.500062942504883,
"kl": 0.099365234375,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7497759461402893,
"reward_std": 0.03234653011895716,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7497759461402893,
"step": 265
},
{
"completion_length": 117.09375381469727,
"epoch": 1.5756676557863503,
"grad_norm": 5.039199352264404,
"kl": 0.1082763671875,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 1.8009319305419922,
"reward_std": 0.032779114320874214,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8009319305419922,
"step": 266
},
{
"completion_length": 117.69792175292969,
"epoch": 1.5816023738872405,
"grad_norm": 4.768641471862793,
"kl": 0.1068115234375,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 1.7576944530010223,
"reward_std": 0.07862287666648626,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7681111395359039,
"step": 267
},
{
"completion_length": 119.13541984558105,
"epoch": 1.5875370919881306,
"grad_norm": 6.0930352210998535,
"kl": 0.09130859375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7958367764949799,
"reward_std": 0.09232278482522815,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8166700899600983,
"step": 268
},
{
"completion_length": 115.75000381469727,
"epoch": 1.5934718100890208,
"grad_norm": 26.93886947631836,
"kl": 0.0869140625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.788069725036621,
"reward_std": 0.02124928869307041,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7880697846412659,
"step": 269
},
{
"completion_length": 114.47916984558105,
"epoch": 1.599406528189911,
"grad_norm": 7.8712663650512695,
"kl": 0.0975341796875,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7922349870204926,
"reward_std": 0.06056637444999069,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8026516437530518,
"step": 270
},
{
"completion_length": 117.11458587646484,
"epoch": 1.6053412462908012,
"grad_norm": 4.941649913787842,
"kl": 0.1014404296875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.8086935579776764,
"reward_std": 0.015388808911666274,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8086935430765152,
"step": 271
},
{
"completion_length": 116.30208587646484,
"epoch": 1.6112759643916914,
"grad_norm": 4.916129112243652,
"kl": 0.1123046875,
"learning_rate": 1e-06,
"loss": 0.0045,
"reward": 1.6760722398757935,
"reward_std": 0.05616055289283395,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.6864889115095139,
"step": 272
},
{
"completion_length": 109.93750190734863,
"epoch": 1.6172106824925816,
"grad_norm": 3.3433773517608643,
"kl": 0.099853515625,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7660618126392365,
"reward_std": 0.05840137042105198,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7660617679357529,
"step": 273
},
{
"completion_length": 116.47916984558105,
"epoch": 1.6231454005934718,
"grad_norm": 3.4884281158447266,
"kl": 0.1029052734375,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.77039036154747,
"reward_std": 0.044363456312566996,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7703903168439865,
"step": 274
},
{
"completion_length": 112.77083396911621,
"epoch": 1.629080118694362,
"grad_norm": 4.025818347930908,
"kl": 0.1072998046875,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 1.724378764629364,
"reward_std": 0.07113260589540005,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7347953617572784,
"step": 275
},
{
"completion_length": 114.58333587646484,
"epoch": 1.6350148367952522,
"grad_norm": 12.856447219848633,
"kl": 0.113525390625,
"learning_rate": 1e-06,
"loss": 0.0045,
"reward": 1.7462435364723206,
"reward_std": 0.07672798447310925,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7566602230072021,
"step": 276
},
{
"completion_length": 114.94791793823242,
"epoch": 1.6409495548961424,
"grad_norm": 9.79211711883545,
"kl": 0.106689453125,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 1.7808694243431091,
"reward_std": 0.07154811033979058,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7912861257791519,
"step": 277
},
{
"completion_length": 114.31250381469727,
"epoch": 1.6468842729970326,
"grad_norm": 4.4307942390441895,
"kl": 0.0987548828125,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.8354142606258392,
"reward_std": 0.03158940875437111,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8354142606258392,
"step": 278
},
{
"completion_length": 113.84375190734863,
"epoch": 1.6528189910979227,
"grad_norm": 6.534969806671143,
"kl": 0.1024169921875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.8007658421993256,
"reward_std": 0.03258772916160524,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.800765797495842,
"step": 279
},
{
"completion_length": 116.56250190734863,
"epoch": 1.658753709198813,
"grad_norm": 3.3599140644073486,
"kl": 0.0963134765625,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7653041183948517,
"reward_std": 0.08127650991082191,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7757207006216049,
"step": 280
},
{
"completion_length": 116.84375381469727,
"epoch": 1.6646884272997031,
"grad_norm": 3.239370107650757,
"kl": 0.0994873046875,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7427730858325958,
"reward_std": 0.17299647070467472,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7740231454372406,
"step": 281
},
{
"completion_length": 116.05208587646484,
"epoch": 1.6706231454005933,
"grad_norm": 9.61308765411377,
"kl": 0.095947265625,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7398262023925781,
"reward_std": 0.03492267336696386,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7398261874914169,
"step": 282
},
{
"completion_length": 117.16666793823242,
"epoch": 1.6765578635014837,
"grad_norm": 2.8493831157684326,
"kl": 0.114990234375,
"learning_rate": 1e-06,
"loss": 0.0046,
"reward": 1.8178634643554688,
"reward_std": 0.04271406587213278,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8178633898496628,
"step": 283
},
{
"completion_length": 120.60416984558105,
"epoch": 1.682492581602374,
"grad_norm": 14.684961318969727,
"kl": 0.091064453125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7771551609039307,
"reward_std": 0.04852711455896497,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7771551162004471,
"step": 284
},
{
"completion_length": 124.32291793823242,
"epoch": 1.688427299703264,
"grad_norm": 15.723881721496582,
"kl": 0.099609375,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7905828952789307,
"reward_std": 0.039514560252428055,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7905828356742859,
"step": 285
},
{
"completion_length": 125.19791984558105,
"epoch": 1.6943620178041543,
"grad_norm": 9.84542179107666,
"kl": 0.0933837890625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7513935565948486,
"reward_std": 0.07400949532166123,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7618101686239243,
"step": 286
},
{
"completion_length": 126.54166793823242,
"epoch": 1.7002967359050445,
"grad_norm": 7.75327205657959,
"kl": 0.1571044921875,
"learning_rate": 1e-06,
"loss": 0.0063,
"reward": 1.7322804033756256,
"reward_std": 0.09522599866613746,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7531136721372604,
"step": 287
},
{
"completion_length": 130.23958778381348,
"epoch": 1.7062314540059347,
"grad_norm": 2.8010153770446777,
"kl": 0.087646484375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7387493252754211,
"reward_std": 0.07657346210908145,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7491659671068192,
"step": 288
},
{
"completion_length": 124.05208587646484,
"epoch": 1.712166172106825,
"grad_norm": 7.392086029052734,
"kl": 0.0909423828125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.8153815567493439,
"reward_std": 0.05484287068247795,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8153815120458603,
"step": 289
},
{
"completion_length": 123.72916984558105,
"epoch": 1.7181008902077153,
"grad_norm": 3.426253080368042,
"kl": 0.0906982421875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.8041094243526459,
"reward_std": 0.03429581504315138,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8041094243526459,
"step": 290
},
{
"completion_length": 128.9895896911621,
"epoch": 1.7240356083086055,
"grad_norm": 5.66267204284668,
"kl": 0.1024169921875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.7256536781787872,
"reward_std": 0.1030060425400734,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7464869767427444,
"step": 291
},
{
"completion_length": 126.36458587646484,
"epoch": 1.7299703264094957,
"grad_norm": 4.350327491760254,
"kl": 0.08984375,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7592671513557434,
"reward_std": 0.10288255475461483,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7801005095243454,
"step": 292
},
{
"completion_length": 126.08333778381348,
"epoch": 1.7359050445103859,
"grad_norm": 5.17351770401001,
"kl": 0.09716796875,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7503591477870941,
"reward_std": 0.08022835082374513,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7607757598161697,
"step": 293
},
{
"completion_length": 124.03125190734863,
"epoch": 1.741839762611276,
"grad_norm": 13.972125053405762,
"kl": 0.095703125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.6726593971252441,
"reward_std": 0.1583157368004322,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7039093673229218,
"step": 294
},
{
"completion_length": 134.20833587646484,
"epoch": 1.7477744807121662,
"grad_norm": 2.3745687007904053,
"kl": 0.1024169921875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.7382087111473083,
"reward_std": 0.13995032757520676,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7694587409496307,
"step": 295
},
{
"completion_length": 131.4270896911621,
"epoch": 1.7537091988130564,
"grad_norm": 3.0401179790496826,
"kl": 0.0921630859375,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7459054291248322,
"reward_std": 0.1274840518599376,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7771554589271545,
"step": 296
},
{
"completion_length": 131.6979217529297,
"epoch": 1.7596439169139466,
"grad_norm": 3.122546911239624,
"kl": 0.103515625,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 1.8136819303035736,
"reward_std": 0.07234706217423081,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8240985721349716,
"step": 297
},
{
"completion_length": 129.11458778381348,
"epoch": 1.7655786350148368,
"grad_norm": 6.673977375030518,
"kl": 0.092529296875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7540749609470367,
"reward_std": 0.11415270157158375,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7749083042144775,
"step": 298
},
{
"completion_length": 131.53125762939453,
"epoch": 1.771513353115727,
"grad_norm": 3.5474982261657715,
"kl": 0.097900390625,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.6313540041446686,
"reward_std": 0.21476275008171797,
"rewards/format_reward": 0.9479166865348816,
"rewards/segmentation_reward": 0.6834373325109482,
"step": 299
},
{
"completion_length": 140.59375381469727,
"epoch": 1.7774480712166172,
"grad_norm": 3.5867486000061035,
"kl": 0.0789794921875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.6557375490665436,
"reward_std": 0.16151662543416023,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.6869875341653824,
"step": 300
},
{
"completion_length": 142.75000762939453,
"epoch": 1.7833827893175074,
"grad_norm": 3.54956316947937,
"kl": 0.087646484375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.777003139257431,
"reward_std": 0.048837858252227306,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7770031094551086,
"step": 301
},
{
"completion_length": 144.80208587646484,
"epoch": 1.7893175074183976,
"grad_norm": 5.2782673835754395,
"kl": 0.08740234375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8177353739738464,
"reward_std": 0.052700204541906714,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8177353739738464,
"step": 302
},
{
"completion_length": 141.9583396911621,
"epoch": 1.7952522255192878,
"grad_norm": 2.5410995483398438,
"kl": 0.0814208984375,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.8333467245101929,
"reward_std": 0.03343971585854888,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8333466798067093,
"step": 303
},
{
"completion_length": 146.65625762939453,
"epoch": 1.801186943620178,
"grad_norm": 3.657487630844116,
"kl": 0.0743408203125,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.7758903205394745,
"reward_std": 0.04749991255812347,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7758902609348297,
"step": 304
},
{
"completion_length": 145.7083396911621,
"epoch": 1.8071216617210681,
"grad_norm": 5.898721218109131,
"kl": 0.0816650390625,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7634993493556976,
"reward_std": 0.13334597554057837,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.7947493195533752,
"step": 305
},
{
"completion_length": 144.53125381469727,
"epoch": 1.8130563798219583,
"grad_norm": 3.5383071899414062,
"kl": 0.078369140625,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8184546828269958,
"reward_std": 0.06767121748998761,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.828871339559555,
"step": 306
},
{
"completion_length": 148.30208587646484,
"epoch": 1.8189910979228485,
"grad_norm": 3.506437063217163,
"kl": 0.078125,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.7857343256473541,
"reward_std": 0.11842209007591009,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8065676689147949,
"step": 307
},
{
"completion_length": 150.96875381469727,
"epoch": 1.8249258160237387,
"grad_norm": 3.9226300716400146,
"kl": 0.0777587890625,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.7295546233654022,
"reward_std": 0.1010670899413526,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.739971250295639,
"step": 308
},
{
"completion_length": 146.6666717529297,
"epoch": 1.8308605341246291,
"grad_norm": 17.635072708129883,
"kl": 0.083740234375,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.7234169244766235,
"reward_std": 0.1391521729528904,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7546669095754623,
"step": 309
},
{
"completion_length": 152.71875762939453,
"epoch": 1.8367952522255193,
"grad_norm": 8.344084739685059,
"kl": 0.08251953125,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7250747382640839,
"reward_std": 0.05801352020353079,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7250746786594391,
"step": 310
},
{
"completion_length": 148.9479217529297,
"epoch": 1.8427299703264095,
"grad_norm": 5.004342555999756,
"kl": 0.08349609375,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7435450851917267,
"reward_std": 0.12095003947615623,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7643783837556839,
"step": 311
},
{
"completion_length": 139.56250381469727,
"epoch": 1.8486646884272997,
"grad_norm": 9.769325256347656,
"kl": 0.08349609375,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.788212239742279,
"reward_std": 0.06093810824677348,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7986288219690323,
"step": 312
},
{
"completion_length": 145.8541717529297,
"epoch": 1.8545994065281899,
"grad_norm": 4.7226643562316895,
"kl": 0.0816650390625,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7469014525413513,
"reward_std": 0.08418664801865816,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7573181390762329,
"step": 313
},
{
"completion_length": 141.0208396911621,
"epoch": 1.86053412462908,
"grad_norm": 4.053534030914307,
"kl": 0.0819091796875,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7462977170944214,
"reward_std": 0.12578246276825666,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7775476276874542,
"step": 314
},
{
"completion_length": 143.83333587646484,
"epoch": 1.8664688427299705,
"grad_norm": 2.9752867221832275,
"kl": 0.080322265625,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.8229963779449463,
"reward_std": 0.03319215914234519,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8229963183403015,
"step": 315
},
{
"completion_length": 142.21875381469727,
"epoch": 1.8724035608308607,
"grad_norm": 3.922870397567749,
"kl": 0.0765380859375,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8421693444252014,
"reward_std": 0.03140254644677043,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8421692997217178,
"step": 316
},
{
"completion_length": 138.50000762939453,
"epoch": 1.8783382789317509,
"grad_norm": 3.026204824447632,
"kl": 0.0841064453125,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.8107014894485474,
"reward_std": 0.0752922969404608,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.810701459646225,
"step": 317
},
{
"completion_length": 135.03125381469727,
"epoch": 1.884272997032641,
"grad_norm": 18.734750747680664,
"kl": 0.08935546875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7768707573413849,
"reward_std": 0.06880995538085699,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7872874289751053,
"step": 318
},
{
"completion_length": 142.09375,
"epoch": 1.8902077151335313,
"grad_norm": 2.997725248336792,
"kl": 0.087890625,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7464922964572906,
"reward_std": 0.13362758047878742,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7673256248235703,
"step": 319
},
{
"completion_length": 145.7916717529297,
"epoch": 1.8961424332344214,
"grad_norm": 7.73359489440918,
"kl": 0.082763671875,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7588372826576233,
"reward_std": 0.07530860649421811,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7692539393901825,
"step": 320
},
{
"completion_length": 138.47917556762695,
"epoch": 1.9020771513353116,
"grad_norm": 5.083198070526123,
"kl": 0.0914306640625,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.72692009806633,
"reward_std": 0.08305464556906372,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.7477534264326096,
"step": 321
},
{
"completion_length": 142.41666793823242,
"epoch": 1.9080118694362018,
"grad_norm": 6.715488433837891,
"kl": 0.0872802734375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.7511941194534302,
"reward_std": 0.031710159964859486,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.751194104552269,
"step": 322
},
{
"completion_length": 138.04167556762695,
"epoch": 1.913946587537092,
"grad_norm": 2.5687637329101562,
"kl": 0.0877685546875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8096555471420288,
"reward_std": 0.03562284540385008,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.809655487537384,
"step": 323
},
{
"completion_length": 136.03125381469727,
"epoch": 1.9198813056379822,
"grad_norm": 5.063670635223389,
"kl": 0.103759765625,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.699085146188736,
"reward_std": 0.09667661227285862,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7095017433166504,
"step": 324
},
{
"completion_length": 135.0208396911621,
"epoch": 1.9258160237388724,
"grad_norm": 4.568605899810791,
"kl": 0.0887451171875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8009403049945831,
"reward_std": 0.04601938929408789,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8009403496980667,
"step": 325
},
{
"completion_length": 130.7604217529297,
"epoch": 1.9317507418397626,
"grad_norm": 4.746346473693848,
"kl": 0.0987548828125,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.7941046059131622,
"reward_std": 0.0846591629087925,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8045211881399155,
"step": 326
},
{
"completion_length": 138.28125381469727,
"epoch": 1.9376854599406528,
"grad_norm": 12.027536392211914,
"kl": 0.08740234375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.80451300740242,
"reward_std": 0.03780581499449909,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8045129328966141,
"step": 327
},
{
"completion_length": 134.66666793823242,
"epoch": 1.943620178041543,
"grad_norm": 8.19904613494873,
"kl": 0.0963134765625,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.7921161651611328,
"reward_std": 0.04007569560781121,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7921161502599716,
"step": 328
},
{
"completion_length": 135.46875762939453,
"epoch": 1.9495548961424332,
"grad_norm": 3.6576757431030273,
"kl": 0.090576171875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.7738652527332306,
"reward_std": 0.04611685499548912,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7842819541692734,
"step": 329
},
{
"completion_length": 139.07291793823242,
"epoch": 1.9554896142433233,
"grad_norm": 5.115423679351807,
"kl": 0.084228515625,
"learning_rate": 1e-06,
"loss": 0.0034,
"reward": 1.8082719147205353,
"reward_std": 0.060723274014890194,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8186886012554169,
"step": 330
},
{
"completion_length": 133.8854217529297,
"epoch": 1.9614243323442135,
"grad_norm": 3.8776745796203613,
"kl": 0.0994873046875,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.8066315650939941,
"reward_std": 0.0820788680575788,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8170481771230698,
"step": 331
},
{
"completion_length": 136.3229217529297,
"epoch": 1.9673590504451037,
"grad_norm": 8.56767463684082,
"kl": 0.08203125,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 1.7753276526927948,
"reward_std": 0.017031708965077996,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7753276079893112,
"step": 332
},
{
"completion_length": 131.2291717529297,
"epoch": 1.973293768545994,
"grad_norm": 3.104804277420044,
"kl": 0.07958984375,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.7486878633499146,
"reward_std": 0.060599199729040265,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7591045498847961,
"step": 333
},
{
"completion_length": 132.98958587646484,
"epoch": 1.979228486646884,
"grad_norm": 3.6994638442993164,
"kl": 0.0992431640625,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 1.7889062762260437,
"reward_std": 0.08577963337302208,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.7993228882551193,
"step": 334
},
{
"completion_length": 132.62500190734863,
"epoch": 1.9851632047477745,
"grad_norm": 3.3083105087280273,
"kl": 0.0926513671875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 1.7760635912418365,
"reward_std": 0.044035853585228324,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7760635614395142,
"step": 335
},
{
"completion_length": 131.53125762939453,
"epoch": 1.9910979228486647,
"grad_norm": 4.454075336456299,
"kl": 0.1187744140625,
"learning_rate": 1e-06,
"loss": 0.0047,
"reward": 1.7569631934165955,
"reward_std": 0.06494582071900368,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7569631636142731,
"step": 336
}
],
"logging_steps": 1.0,
"max_steps": 336,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}