loris3 committed
Commit fd67df8 · verified · 1 Parent(s): c7ab533

Upload folder using huggingface_hub
checkpoints/checkpoint-3966/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 64,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 2048,
+ "max_position_embeddings": 256,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "num_key_value_heads": 12,
+ "pad_token_id": 1,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.47.0",
+ "use_cache": true,
+ "vocab_size": 16000
+ }
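
The config above describes a small LLaMA-style model: 12 layers, hidden size 768, 12 attention heads, a 16,000-token vocabulary, a 256-token context, and tied embeddings (on the order of 97M parameters in float32, consistent with the ~389 MB model.safetensors below). As a minimal sketch, not part of this commit, the checkpoint can be loaded with transformers once the folder is available locally; the path below is an assumption, and since no tokenizer files are included in this checkpoint folder, text generation would additionally need the tokenizer from elsewhere in the repository.

    # Sketch only: the local path is an assumption about where this folder was downloaded.
    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt_dir = "checkpoints/checkpoint-3966"

    config = AutoConfig.from_pretrained(ckpt_dir)           # reads the config.json shown above
    print(config.model_type, config.hidden_size, config.num_hidden_layers)

    model = AutoModelForCausalLM.from_pretrained(ckpt_dir)  # loads model.safetensors
    print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
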
checkpoints/checkpoint-3966/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 1,
+ "transformers_version": "4.47.0"
+ }
checkpoints/checkpoint-3966/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8c0d194da43fdefb45ab560e75e7affad8a5a035910b6f83995886e3e0750ef
+ size 388979624
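
model.safetensors and the other large binaries are stored as Git LFS pointers; the pointer above resolves to a ~389 MB safetensors blob identified by its sha256 oid. As a hypothetical integrity check, not part of this commit, one could hash the downloaded file and compare it with the recorded oid, assuming the real blob has already been fetched (e.g. via git lfs pull or the Hub download) to the path used below.

    # Sketch only: path is an assumption; run after the actual blob has been fetched.
    import hashlib

    def sha256_of(path, chunk=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk):
                h.update(block)
        return h.hexdigest()

    expected = "b8c0d194da43fdefb45ab560e75e7affad8a5a035910b6f83995886e3e0750ef"
    print(sha256_of("checkpoints/checkpoint-3966/model.safetensors") == expected)
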
checkpoints/checkpoint-3966/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b7afe73a4bff698fb5dcba06d6ee572d1a9736dec727f9707b8676569a0fda6
+ size 778027770
checkpoints/checkpoint-3966/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac0389b5da961b38667013030da96e0e998cdc2366307000dfb275a026d99b15
+ size 14244
checkpoints/checkpoint-3966/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be9d123214d0df33f7c4f8a61ba0bf6bf909be2ed7cdccafbae16e6057d28353
+ size 1064
checkpoints/checkpoint-3966/trainer_state.json ADDED
@@ -0,0 +1,686 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.220581849761182,
+ "eval_steps": 500,
+ "global_step": 3966,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08684324793747286,
+ "grad_norm": 31.347537994384766,
+ "learning_rate": 0.00011666666666666667,
+ "loss": 94.1908,
+ "step": 50
+ },
+ {
+ "epoch": 0.17368649587494572,
+ "grad_norm": 17.979413986206055,
+ "learning_rate": 0.00023333333333333333,
+ "loss": 56.4907,
+ "step": 100
+ },
+ {
+ "epoch": 0.26052974381241856,
+ "grad_norm": 8.942031860351562,
+ "learning_rate": 0.00035,
+ "loss": 48.4783,
+ "step": 150
+ },
+ {
+ "epoch": 0.34737299174989145,
+ "grad_norm": 6.502614974975586,
+ "learning_rate": 0.00046666666666666666,
+ "loss": 45.3011,
+ "step": 200
+ },
+ {
+ "epoch": 0.4342162396873643,
+ "grad_norm": 6.777210235595703,
+ "learning_rate": 0.0005833333333333334,
+ "loss": 43.6152,
+ "step": 250
+ },
+ {
+ "epoch": 0.5210594876248371,
+ "grad_norm": 6.147511959075928,
+ "learning_rate": 0.0007,
+ "loss": 42.3483,
+ "step": 300
+ },
+ {
+ "epoch": 0.60790273556231,
+ "grad_norm": 5.621304988861084,
+ "learning_rate": 0.0006998546367133479,
+ "loss": 41.3118,
+ "step": 350
+ },
+ {
+ "epoch": 0.6947459834997829,
+ "grad_norm": 5.471296787261963,
+ "learning_rate": 0.0006994186675990208,
+ "loss": 40.7393,
+ "step": 400
+ },
+ {
+ "epoch": 0.7815892314372558,
+ "grad_norm": 7.381715774536133,
+ "learning_rate": 0.0006986924547936092,
+ "loss": 40.1379,
+ "step": 450
+ },
+ {
+ "epoch": 0.8684324793747286,
+ "grad_norm": 4.7940192222595215,
+ "learning_rate": 0.000697676601523857,
+ "loss": 39.8433,
+ "step": 500
+ },
+ {
+ "epoch": 0.9552757273122015,
+ "grad_norm": 4.409699440002441,
+ "learning_rate": 0.0006963719516055934,
+ "loss": 39.5783,
+ "step": 550
+ },
+ {
+ "epoch": 0.998697351280938,
+ "eval_accuracy": 0.0,
+ "eval_loss": 5.713693618774414,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 115.6974,
+ "eval_samples_per_second": 509.519,
+ "eval_steps_per_second": 1.003,
+ "step": 575
+ },
+ {
+ "epoch": 1.0434216239687364,
+ "grad_norm": 4.693251609802246,
+ "learning_rate": 0.0006947795887428181,
+ "loss": 38.3627,
+ "step": 600
+ },
+ {
+ "epoch": 1.1302648719062094,
+ "grad_norm": 5.077625274658203,
+ "learning_rate": 0.0006929008356275276,
+ "loss": 37.544,
+ "step": 650
+ },
+ {
+ "epoch": 1.2171081198436822,
+ "grad_norm": 4.627607345581055,
+ "learning_rate": 0.0006907372528410224,
+ "loss": 37.5222,
+ "step": 700
+ },
+ {
+ "epoch": 1.303951367781155,
+ "grad_norm": 4.705991744995117,
+ "learning_rate": 0.0006882906375576155,
+ "loss": 36.4945,
+ "step": 750
+ },
+ {
+ "epoch": 1.390794615718628,
+ "grad_norm": 4.745748996734619,
+ "learning_rate": 0.0006855630220518143,
+ "loss": 36.3402,
+ "step": 800
+ },
+ {
+ "epoch": 1.4776378636561007,
+ "grad_norm": 4.541324138641357,
+ "learning_rate": 0.0006825566720102167,
+ "loss": 36.423,
+ "step": 850
+ },
+ {
+ "epoch": 1.5644811115935737,
+ "grad_norm": 4.555329322814941,
+ "learning_rate": 0.0006792740846495249,
+ "loss": 36.4842,
+ "step": 900
+ },
+ {
+ "epoch": 1.6513243595310465,
+ "grad_norm": 4.505599021911621,
+ "learning_rate": 0.0006757179866422389,
+ "loss": 36.5019,
+ "step": 950
+ },
+ {
+ "epoch": 1.7381676074685193,
+ "grad_norm": 4.661733627319336,
+ "learning_rate": 0.0006718913318517527,
+ "loss": 36.0491,
+ "step": 1000
+ },
+ {
+ "epoch": 1.825010855405992,
+ "grad_norm": 4.7356438636779785,
+ "learning_rate": 0.0006677972988787362,
+ "loss": 33.5704,
+ "step": 1050
+ },
+ {
+ "epoch": 1.911854103343465,
+ "grad_norm": 4.665255069732666,
+ "learning_rate": 0.0006634392884208387,
+ "loss": 33.8845,
+ "step": 1100
+ },
+ {
+ "epoch": 1.998697351280938,
+ "grad_norm": 5.151551246643066,
+ "learning_rate": 0.0006588209204479085,
+ "loss": 34.1235,
+ "step": 1150
+ },
+ {
+ "epoch": 1.998697351280938,
+ "eval_accuracy": 0.0,
+ "eval_loss": 5.946074962615967,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 115.8973,
+ "eval_samples_per_second": 508.64,
+ "eval_steps_per_second": 1.001,
+ "step": 1150
+ },
+ {
+ "epoch": 2.086843247937473,
+ "grad_norm": 7.160614490509033,
+ "learning_rate": 0.0006539460311950741,
+ "loss": 75.2515,
+ "step": 1200
+ },
+ {
+ "epoch": 2.1736864958749456,
+ "grad_norm": 6.339908123016357,
+ "learning_rate": 0.000648818669976186,
+ "loss": 63.1052,
+ "step": 1250
+ },
+ {
+ "epoch": 2.260529743812419,
+ "grad_norm": 6.869708061218262,
+ "learning_rate": 0.0006434430958202652,
+ "loss": 55.7262,
+ "step": 1300
+ },
+ {
+ "epoch": 2.3473729917498916,
+ "grad_norm": 8.690558433532715,
+ "learning_rate": 0.0006378237739337511,
+ "loss": 46.9368,
+ "step": 1350
+ },
+ {
+ "epoch": 2.4342162396873643,
+ "grad_norm": 10.8461275100708,
+ "learning_rate": 0.0006319653719914907,
+ "loss": 36.8508,
+ "step": 1400
+ },
+ {
+ "epoch": 2.521059487624837,
+ "grad_norm": 11.691084861755371,
+ "learning_rate": 0.000625872756259546,
+ "loss": 27.586,
+ "step": 1450
+ },
+ {
+ "epoch": 2.60790273556231,
+ "grad_norm": 11.534943580627441,
+ "learning_rate": 0.0006195509875530431,
+ "loss": 20.8625,
+ "step": 1500
+ },
+ {
+ "epoch": 2.694745983499783,
+ "grad_norm": 10.615392684936523,
+ "learning_rate": 0.0006130053170324202,
+ "loss": 16.9027,
+ "step": 1550
+ },
+ {
+ "epoch": 2.781589231437256,
+ "grad_norm": 9.750032424926758,
+ "learning_rate": 0.000606241181841564,
+ "loss": 14.6911,
+ "step": 1600
+ },
+ {
+ "epoch": 2.847590099869735,
+ "eval_accuracy": 0.0,
+ "eval_loss": 7.1964263916015625,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 115.2002,
+ "eval_samples_per_second": 511.718,
+ "eval_steps_per_second": 1.007,
+ "step": 1638
+ },
+ {
+ "epoch": 3.0208423795049937,
+ "grad_norm": 9.399425506591797,
+ "learning_rate": 0.0005992642005914615,
+ "loss": 13.6775,
+ "step": 1650
+ },
+ {
+ "epoch": 3.1076856274424665,
+ "grad_norm": 8.75631046295166,
+ "learning_rate": 0.0005920801686931151,
+ "loss": 12.8369,
+ "step": 1700
+ },
+ {
+ "epoch": 3.1945288753799392,
+ "grad_norm": 8.309281349182129,
+ "learning_rate": 0.0005846950535436001,
+ "loss": 12.3939,
+ "step": 1750
+ },
+ {
+ "epoch": 3.281372123317412,
+ "grad_norm": 7.948273658752441,
+ "learning_rate": 0.0005771149895692616,
+ "loss": 12.1119,
+ "step": 1800
+ },
+ {
+ "epoch": 3.368215371254885,
+ "grad_norm": 7.4247727394104,
+ "learning_rate": 0.0005693462731301704,
+ "loss": 11.759,
+ "step": 1850
+ },
+ {
+ "epoch": 3.455058619192358,
+ "grad_norm": 7.026332378387451,
+ "learning_rate": 0.0005613953572900671,
+ "loss": 11.5219,
+ "step": 1900
+ },
+ {
+ "epoch": 3.541901867129831,
+ "grad_norm": 6.633806228637695,
+ "learning_rate": 0.0005532688464561429,
+ "loss": 11.3874,
+ "step": 1950
+ },
+ {
+ "epoch": 3.6287451150673036,
+ "grad_norm": 6.791120529174805,
+ "learning_rate": 0.0005449734908931053,
+ "loss": 11.2119,
+ "step": 2000
+ },
+ {
+ "epoch": 3.7155883630047764,
+ "grad_norm": 5.996912002563477,
+ "learning_rate": 0.0005365161811160892,
+ "loss": 11.0684,
+ "step": 2050
+ },
+ {
+ "epoch": 3.802431610942249,
+ "grad_norm": 5.432217121124268,
+ "learning_rate": 0.0005279039421670681,
+ "loss": 10.9551,
+ "step": 2100
+ },
+ {
+ "epoch": 3.847590099869735,
+ "eval_accuracy": 0.0,
+ "eval_loss": 8.056244850158691,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 115.2236,
+ "eval_samples_per_second": 511.614,
+ "eval_steps_per_second": 1.007,
+ "step": 2126
+ },
+ {
+ "epoch": 4.041684759009987,
+ "grad_norm": 9.604610443115234,
+ "learning_rate": 0.0005191439277795228,
+ "loss": 47.1122,
+ "step": 2150
+ },
+ {
+ "epoch": 4.12852800694746,
+ "grad_norm": 8.961295127868652,
+ "learning_rate": 0.0005102434144362101,
+ "loss": 70.3217,
+ "step": 2200
+ },
+ {
+ "epoch": 4.215371254884933,
+ "grad_norm": 8.817536354064941,
+ "learning_rate": 0.0005012097953249728,
+ "loss": 66.2661,
+ "step": 2250
+ },
+ {
+ "epoch": 4.302214502822405,
+ "grad_norm": 8.901741027832031,
+ "learning_rate": 0.0004920505741976074,
+ "loss": 64.2466,
+ "step": 2300
+ },
+ {
+ "epoch": 4.3890577507598785,
+ "grad_norm": 9.773693084716797,
+ "learning_rate": 0.00048277335913689246,
+ "loss": 57.701,
+ "step": 2350
+ },
+ {
+ "epoch": 4.475900998697352,
+ "grad_norm": 8.872902870178223,
+ "learning_rate": 0.0004733858562369547,
+ "loss": 57.1105,
+ "step": 2400
+ },
+ {
+ "epoch": 4.562744246634824,
+ "grad_norm": 8.541085243225098,
+ "learning_rate": 0.0004638958632022228,
+ "loss": 56.5631,
+ "step": 2450
+ },
+ {
+ "epoch": 4.649587494572297,
+ "grad_norm": 10.593109130859375,
+ "learning_rate": 0.0004543112628702843,
+ "loss": 51.5551,
+ "step": 2500
+ },
+ {
+ "epoch": 4.73643074250977,
+ "grad_norm": 11.323746681213379,
+ "learning_rate": 0.00044464001666402774,
+ "loss": 45.8094,
+ "step": 2550
+ },
+ {
+ "epoch": 4.823273990447243,
+ "grad_norm": 11.540026664733887,
+ "learning_rate": 0.00043489015797850783,
+ "loss": 46.4972,
+ "step": 2600
+ },
+ {
+ "epoch": 4.910117238384716,
+ "grad_norm": 11.071066856384277,
+ "learning_rate": 0.00042506978550802693,
+ "loss": 46.8668,
+ "step": 2650
+ },
+ {
+ "epoch": 4.996960486322188,
+ "grad_norm": 12.022558212280273,
+ "learning_rate": 0.00041518705651897615,
+ "loss": 35.7416,
+ "step": 2700
+ },
+ {
+ "epoch": 4.998697351280938,
+ "eval_accuracy": 0.0,
+ "eval_loss": 7.155083179473877,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 179.9178,
+ "eval_samples_per_second": 327.65,
+ "eval_steps_per_second": 0.645,
+ "step": 2701
+ },
+ {
+ "epoch": 5.085106382978723,
+ "grad_norm": 11.98468017578125,
+ "learning_rate": 0.0004052501800740239,
+ "loss": 32.6326,
+ "step": 2750
+ },
+ {
+ "epoch": 5.171949630916196,
+ "grad_norm": 12.476215362548828,
+ "learning_rate": 0.00039526741021327923,
+ "loss": 33.6548,
+ "step": 2800
+ },
+ {
+ "epoch": 5.2587928788536695,
+ "grad_norm": 12.83471393585205,
+ "learning_rate": 0.00038524703909809544,
+ "loss": 34.5809,
+ "step": 2850
+ },
+ {
+ "epoch": 5.345636126791142,
+ "grad_norm": 10.729170799255371,
+ "learning_rate": 0.00037519739012320844,
+ "loss": 34.2459,
+ "step": 2900
+ },
+ {
+ "epoch": 5.432479374728615,
+ "grad_norm": 11.369369506835938,
+ "learning_rate": 0.0003651268110029309,
+ "loss": 26.2298,
+ "step": 2950
+ },
+ {
+ "epoch": 5.519322622666087,
+ "grad_norm": 12.255570411682129,
+ "learning_rate": 0.00035504366683714505,
+ "loss": 27.5998,
+ "step": 3000
+ },
+ {
+ "epoch": 5.606165870603561,
+ "grad_norm": 12.656834602355957,
+ "learning_rate": 0.00034495633316285505,
+ "loss": 28.2905,
+ "step": 3050
+ },
+ {
+ "epoch": 5.693009118541033,
+ "grad_norm": 9.809476852416992,
+ "learning_rate": 0.0003348731889970691,
+ "loss": 25.0195,
+ "step": 3100
+ },
+ {
+ "epoch": 5.779852366478506,
+ "grad_norm": 10.557951927185059,
+ "learning_rate": 0.00032480260987679155,
+ "loss": 23.7825,
+ "step": 3150
+ },
+ {
+ "epoch": 5.866695614415979,
+ "grad_norm": 10.909571647644043,
+ "learning_rate": 0.0003147529609019046,
+ "loss": 24.4192,
+ "step": 3200
+ },
+ {
+ "epoch": 5.953538862353452,
+ "grad_norm": 8.427265167236328,
+ "learning_rate": 0.0003047325897867208,
+ "loss": 24.474,
+ "step": 3250
+ },
+ {
+ "epoch": 5.998697351280938,
+ "eval_accuracy": 0.0,
+ "eval_loss": 8.506017684936523,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 116.2487,
+ "eval_samples_per_second": 507.102,
+ "eval_steps_per_second": 0.998,
+ "step": 3276
+ },
+ {
+ "epoch": 6.041684759009987,
+ "grad_norm": 6.966476917266846,
+ "learning_rate": 0.0002947498199259761,
+ "loss": 59.2918,
+ "step": 3300
+ },
+ {
+ "epoch": 6.12852800694746,
+ "grad_norm": 6.616938591003418,
+ "learning_rate": 0.0002848129434810239,
+ "loss": 72.441,
+ "step": 3350
+ },
+ {
+ "epoch": 6.215371254884933,
+ "grad_norm": 7.3314995765686035,
+ "learning_rate": 0.00027493021449197306,
+ "loss": 59.2743,
+ "step": 3400
+ },
+ {
+ "epoch": 6.302214502822405,
+ "grad_norm": 10.363481521606445,
+ "learning_rate": 0.00026510984202149227,
+ "loss": 46.5235,
+ "step": 3450
+ },
+ {
+ "epoch": 6.378636561007382,
+ "eval_accuracy": 0.0,
+ "eval_loss": 7.493179798126221,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 118.0663,
+ "eval_samples_per_second": 499.296,
+ "eval_steps_per_second": 0.982,
+ "step": 3494
+ },
+ {
+ "epoch": 7.010421189752496,
+ "grad_norm": 10.510085105895996,
+ "learning_rate": 0.0002553599833359722,
+ "loss": 35.4207,
+ "step": 3500
+ },
+ {
+ "epoch": 7.09726443768997,
+ "grad_norm": 12.08901309967041,
+ "learning_rate": 0.0002456887371297157,
+ "loss": 22.8856,
+ "step": 3550
+ },
+ {
+ "epoch": 7.184107685627443,
+ "grad_norm": 10.019026756286621,
+ "learning_rate": 0.00023610413679777718,
+ "loss": 14.8815,
+ "step": 3600
+ },
+ {
+ "epoch": 7.270950933564915,
+ "grad_norm": 10.840076446533203,
+ "learning_rate": 0.00022661414376304531,
+ "loss": 9.368,
+ "step": 3650
+ },
+ {
+ "epoch": 7.357794181502388,
+ "grad_norm": 8.214183807373047,
+ "learning_rate": 0.00021722664086310753,
+ "loss": 6.6961,
+ "step": 3700
+ },
+ {
+ "epoch": 7.378636561007382,
+ "eval_accuracy": 0.0,
+ "eval_loss": 9.528374671936035,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 188.9886,
+ "eval_samples_per_second": 311.924,
+ "eval_steps_per_second": 0.614,
+ "step": 3712
+ },
+ {
+ "epoch": 8.066000868432479,
+ "grad_norm": 5.275137424468994,
+ "learning_rate": 0.00020794942580239256,
+ "loss": 71.8294,
+ "step": 3750
+ },
+ {
+ "epoch": 8.152844116369952,
+ "grad_norm": 5.0429768562316895,
+ "learning_rate": 0.0001987902046750272,
+ "loss": 69.779,
+ "step": 3800
+ },
+ {
+ "epoch": 8.220581849761182,
+ "eval_accuracy": 0.0,
+ "eval_loss": 6.233586311340332,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 114.8088,
+ "eval_samples_per_second": 513.462,
+ "eval_steps_per_second": 1.01,
+ "step": 3839
+ },
+ {
+ "epoch": 9.019105514546244,
+ "grad_norm": 6.48486328125,
+ "learning_rate": 0.00018975658556379,
+ "loss": 62.5376,
+ "step": 3850
+ },
+ {
+ "epoch": 9.105948762483717,
+ "grad_norm": 6.984375,
+ "learning_rate": 0.0001808560722204773,
+ "loss": 56.5479,
+ "step": 3900
+ },
+ {
+ "epoch": 9.19279201042119,
+ "grad_norm": 9.563068389892578,
+ "learning_rate": 0.000172096057832932,
+ "loss": 49.6458,
+ "step": 3950
+ },
+ {
+ "epoch": 9.220581849761182,
+ "eval_accuracy": 0.0,
+ "eval_loss": 7.126227378845215,
+ "eval_normalizer": 966112.0,
+ "eval_runtime": 117.5564,
+ "eval_samples_per_second": 501.461,
+ "eval_steps_per_second": 0.987,
+ "step": 3966
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 5750,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.151701558926305e+17,
+ "train_batch_size": 128,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoints/checkpoint-3966/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa9a830de5c611a0b03c8b33ab915cf89010e60abaf6d5647f96cb57e683f0b5
+ size 5432
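
Alongside the weights, the checkpoint stores the optimizer, LR scheduler, and RNG state plus the pickled TrainingArguments (training_args.bin), which is what allows a Trainer run to be resumed from step 3966 of 5750 via trainer.train(resume_from_checkpoint=...). A minimal sketch that only inspects the saved state, again assuming the folder is available at the local path used earlier:

    # Sketch only: reads trainer_state.json and training_args.bin from this checkpoint.
    import json
    import torch  # transformers must also be installed so TrainingArguments can be unpickled

    ckpt_dir = "checkpoints/checkpoint-3966"

    with open(f"{ckpt_dir}/trainer_state.json") as f:
        state = json.load(f)
    print(state["global_step"], round(state["epoch"], 2))  # 3966, ~9.22
    print(state["log_history"][-1])                        # last eval record (step 3966)

    args = torch.load(f"{ckpt_dir}/training_args.bin", weights_only=False)
    print(args.per_device_train_batch_size, args.num_train_epochs)
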