minpeter committed on
Commit
77a5e93
·
verified ·
1 Parent(s): c96a817

Training in progress, step 22000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba9d75a78fad20f4b1e389f6c85dda0f453be86d800ed2eba32953160cc02033
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ed6ac9ccb5e7d64d2e1c9f2879aedec4abdbea9a50739e1287448962b822ce
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df2f641838670afd6d1bb0181e8efde74cebba7ddaeaad933397844d1eb9afb6
3
  size 209816139
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21aacf2f2f02d684309501173ad828552422076a0c16aea88131afc48854c0e7
3
  size 209816139
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eda74d083cd5d9b07d403914b5a235c44dd87bc93a29636e940f36b95f8743f9
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b324ca901edaaaa5841347eeb4f75e6d12219bec163c3b83caffcbf6520d58a3
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91f1feed6ec98326449107f6ac06aad035f8176b90aa697c6edf6a509039a50c
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f93a058689b8ada6622702c5a5833d4e962616a54302ee183cbf481797944cfb
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6645e7dc37725bbae83eaf70fb81001a75be54d9a6554f43743dfb20cfc0984
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6cc8155f9a668ccc650b25a3629943fe3a02ee796145b1ad7dac78628d32ab5
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
- "epoch": 0.06558523894888724,
6
  "eval_steps": 1000,
7
- "global_step": 21000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6056,6 +6056,294 @@
6056
  "eval_samples_per_second": 50.848,
6057
  "eval_steps_per_second": 3.186,
6058
  "step": 21000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6059
  }
6060
  ],
6061
  "logging_steps": 25,
@@ -6075,7 +6363,7 @@
6075
  "attributes": {}
6076
  }
6077
  },
6078
- "total_flos": 2.663111367480836e+18,
6079
  "train_batch_size": 8,
6080
  "trial_name": null,
6081
  "trial_params": null
 
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
+ "epoch": 0.06870834556550091,
6
  "eval_steps": 1000,
7
+ "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6056
  "eval_samples_per_second": 50.848,
6057
  "eval_steps_per_second": 3.186,
6058
  "step": 21000
6059
+ },
6060
+ {
6061
+ "epoch": 0.06566331661430258,
6062
+ "grad_norm": 37.0,
6063
+ "learning_rate": 0.0009993297472148076,
6064
+ "loss": 33.5467,
6065
+ "step": 21025
6066
+ },
6067
+ {
6068
+ "epoch": 0.06574139427971792,
6069
+ "grad_norm": 38.5,
6070
+ "learning_rate": 0.0009993230482511295,
6071
+ "loss": 33.6705,
6072
+ "step": 21050
6073
+ },
6074
+ {
6075
+ "epoch": 0.06581947194513327,
6076
+ "grad_norm": 39.0,
6077
+ "learning_rate": 0.0009993163159993798,
6078
+ "loss": 33.7872,
6079
+ "step": 21075
6080
+ },
6081
+ {
6082
+ "epoch": 0.06589754961054861,
6083
+ "grad_norm": 45.5,
6084
+ "learning_rate": 0.0009993095504600067,
6085
+ "loss": 33.6316,
6086
+ "step": 21100
6087
+ },
6088
+ {
6089
+ "epoch": 0.06597562727596394,
6090
+ "grad_norm": 38.0,
6091
+ "learning_rate": 0.0009993027516334617,
6092
+ "loss": 33.8796,
6093
+ "step": 21125
6094
+ },
6095
+ {
6096
+ "epoch": 0.06605370494137928,
6097
+ "grad_norm": 43.75,
6098
+ "learning_rate": 0.000999295919520198,
6099
+ "loss": 34.0526,
6100
+ "step": 21150
6101
+ },
6102
+ {
6103
+ "epoch": 0.06613178260679463,
6104
+ "grad_norm": 36.0,
6105
+ "learning_rate": 0.000999289054120671,
6106
+ "loss": 34.1438,
6107
+ "step": 21175
6108
+ },
6109
+ {
6110
+ "epoch": 0.06620986027220997,
6111
+ "grad_norm": 38.0,
6112
+ "learning_rate": 0.0009992821554353382,
6113
+ "loss": 33.7974,
6114
+ "step": 21200
6115
+ },
6116
+ {
6117
+ "epoch": 0.06628793793762532,
6118
+ "grad_norm": 46.0,
6119
+ "learning_rate": 0.00099927522346466,
6120
+ "loss": 33.8107,
6121
+ "step": 21225
6122
+ },
6123
+ {
6124
+ "epoch": 0.06636601560304066,
6125
+ "grad_norm": 45.75,
6126
+ "learning_rate": 0.0009992682582090982,
6127
+ "loss": 33.8952,
6128
+ "step": 21250
6129
+ },
6130
+ {
6131
+ "epoch": 0.066444093268456,
6132
+ "grad_norm": 39.5,
6133
+ "learning_rate": 0.0009992612596691171,
6134
+ "loss": 34.201,
6135
+ "step": 21275
6136
+ },
6137
+ {
6138
+ "epoch": 0.06652217093387135,
6139
+ "grad_norm": 49.25,
6140
+ "learning_rate": 0.0009992542278451832,
6141
+ "loss": 34.2007,
6142
+ "step": 21300
6143
+ },
6144
+ {
6145
+ "epoch": 0.06660024859928668,
6146
+ "grad_norm": 42.0,
6147
+ "learning_rate": 0.0009992471627377657,
6148
+ "loss": 34.3501,
6149
+ "step": 21325
6150
+ },
6151
+ {
6152
+ "epoch": 0.06667832626470202,
6153
+ "grad_norm": 48.75,
6154
+ "learning_rate": 0.0009992400643473354,
6155
+ "loss": 34.4321,
6156
+ "step": 21350
6157
+ },
6158
+ {
6159
+ "epoch": 0.06675640393011736,
6160
+ "grad_norm": 43.25,
6161
+ "learning_rate": 0.0009992329326743653,
6162
+ "loss": 34.638,
6163
+ "step": 21375
6164
+ },
6165
+ {
6166
+ "epoch": 0.06683448159553271,
6167
+ "grad_norm": 41.75,
6168
+ "learning_rate": 0.000999225767719331,
6169
+ "loss": 34.588,
6170
+ "step": 21400
6171
+ },
6172
+ {
6173
+ "epoch": 0.06691255926094805,
6174
+ "grad_norm": 44.5,
6175
+ "learning_rate": 0.0009992185694827102,
6176
+ "loss": 34.7111,
6177
+ "step": 21425
6178
+ },
6179
+ {
6180
+ "epoch": 0.0669906369263634,
6181
+ "grad_norm": 50.5,
6182
+ "learning_rate": 0.0009992113379649829,
6183
+ "loss": 34.7677,
6184
+ "step": 21450
6185
+ },
6186
+ {
6187
+ "epoch": 0.06706871459177874,
6188
+ "grad_norm": 62.0,
6189
+ "learning_rate": 0.000999204073166631,
6190
+ "loss": 35.0234,
6191
+ "step": 21475
6192
+ },
6193
+ {
6194
+ "epoch": 0.06714679225719408,
6195
+ "grad_norm": 48.0,
6196
+ "learning_rate": 0.0009991967750881388,
6197
+ "loss": 35.0909,
6198
+ "step": 21500
6199
+ },
6200
+ {
6201
+ "epoch": 0.06722486992260941,
6202
+ "grad_norm": 49.5,
6203
+ "learning_rate": 0.000999189443729993,
6204
+ "loss": 35.4811,
6205
+ "step": 21525
6206
+ },
6207
+ {
6208
+ "epoch": 0.06730294758802476,
6209
+ "grad_norm": 58.0,
6210
+ "learning_rate": 0.0009991820790926824,
6211
+ "loss": 35.2726,
6212
+ "step": 21550
6213
+ },
6214
+ {
6215
+ "epoch": 0.0673810252534401,
6216
+ "grad_norm": 55.5,
6217
+ "learning_rate": 0.0009991746811766975,
6218
+ "loss": 35.629,
6219
+ "step": 21575
6220
+ },
6221
+ {
6222
+ "epoch": 0.06745910291885544,
6223
+ "grad_norm": 44.0,
6224
+ "learning_rate": 0.000999167249982532,
6225
+ "loss": 35.4736,
6226
+ "step": 21600
6227
+ },
6228
+ {
6229
+ "epoch": 0.06753718058427079,
6230
+ "grad_norm": 45.75,
6231
+ "learning_rate": 0.0009991597855106814,
6232
+ "loss": 35.2275,
6233
+ "step": 21625
6234
+ },
6235
+ {
6236
+ "epoch": 0.06761525824968613,
6237
+ "grad_norm": 41.5,
6238
+ "learning_rate": 0.0009991522877616428,
6239
+ "loss": 35.2907,
6240
+ "step": 21650
6241
+ },
6242
+ {
6243
+ "epoch": 0.06769333591510147,
6244
+ "grad_norm": 56.5,
6245
+ "learning_rate": 0.000999144756735916,
6246
+ "loss": 35.2988,
6247
+ "step": 21675
6248
+ },
6249
+ {
6250
+ "epoch": 0.06777141358051682,
6251
+ "grad_norm": 56.0,
6252
+ "learning_rate": 0.000999137192434004,
6253
+ "loss": 35.2948,
6254
+ "step": 21700
6255
+ },
6256
+ {
6257
+ "epoch": 0.06784949124593215,
6258
+ "grad_norm": 42.0,
6259
+ "learning_rate": 0.0009991295948564103,
6260
+ "loss": 35.1186,
6261
+ "step": 21725
6262
+ },
6263
+ {
6264
+ "epoch": 0.06792756891134749,
6265
+ "grad_norm": 43.25,
6266
+ "learning_rate": 0.0009991219640036416,
6267
+ "loss": 35.115,
6268
+ "step": 21750
6269
+ },
6270
+ {
6271
+ "epoch": 0.06800564657676283,
6272
+ "grad_norm": 43.75,
6273
+ "learning_rate": 0.0009991142998762065,
6274
+ "loss": 35.347,
6275
+ "step": 21775
6276
+ },
6277
+ {
6278
+ "epoch": 0.06808372424217818,
6279
+ "grad_norm": 45.0,
6280
+ "learning_rate": 0.000999106602474616,
6281
+ "loss": 35.3008,
6282
+ "step": 21800
6283
+ },
6284
+ {
6285
+ "epoch": 0.06816180190759352,
6286
+ "grad_norm": 66.0,
6287
+ "learning_rate": 0.0009990988717993832,
6288
+ "loss": 35.321,
6289
+ "step": 21825
6290
+ },
6291
+ {
6292
+ "epoch": 0.06823987957300887,
6293
+ "grad_norm": 56.0,
6294
+ "learning_rate": 0.0009990911078510238,
6295
+ "loss": 35.373,
6296
+ "step": 21850
6297
+ },
6298
+ {
6299
+ "epoch": 0.06831795723842421,
6300
+ "grad_norm": 49.25,
6301
+ "learning_rate": 0.000999083310630055,
6302
+ "loss": 35.2404,
6303
+ "step": 21875
6304
+ },
6305
+ {
6306
+ "epoch": 0.06839603490383955,
6307
+ "grad_norm": 46.0,
6308
+ "learning_rate": 0.000999075480136997,
6309
+ "loss": 35.2177,
6310
+ "step": 21900
6311
+ },
6312
+ {
6313
+ "epoch": 0.06847411256925488,
6314
+ "grad_norm": 43.5,
6315
+ "learning_rate": 0.0009990676163723715,
6316
+ "loss": 35.1759,
6317
+ "step": 21925
6318
+ },
6319
+ {
6320
+ "epoch": 0.06855219023467023,
6321
+ "grad_norm": 54.5,
6322
+ "learning_rate": 0.000999059719336703,
6323
+ "loss": 34.7193,
6324
+ "step": 21950
6325
+ },
6326
+ {
6327
+ "epoch": 0.06863026790008557,
6328
+ "grad_norm": 48.25,
6329
+ "learning_rate": 0.0009990517890305175,
6330
+ "loss": 34.6676,
6331
+ "step": 21975
6332
+ },
6333
+ {
6334
+ "epoch": 0.06870834556550091,
6335
+ "grad_norm": 44.75,
6336
+ "learning_rate": 0.0009990438254543442,
6337
+ "loss": 34.4965,
6338
+ "step": 22000
6339
+ },
6340
+ {
6341
+ "epoch": 0.06870834556550091,
6342
+ "eval_loss": 34.531646728515625,
6343
+ "eval_runtime": 102.6371,
6344
+ "eval_samples_per_second": 50.693,
6345
+ "eval_steps_per_second": 3.176,
6346
+ "step": 22000
6347
  }
6348
  ],
6349
  "logging_steps": 25,
 
6363
  "attributes": {}
6364
  }
6365
  },
6366
+ "total_flos": 2.7899608404454277e+18,
6367
  "train_batch_size": 8,
6368
  "trial_name": null,
6369
  "trial_params": null