Training in progress, step 22000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 373077376
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3ed6ac9ccb5e7d64d2e1c9f2879aedec4abdbea9a50739e1287448962b822ce
|
| 3 |
size 373077376
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 209816139
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21aacf2f2f02d684309501173ad828552422076a0c16aea88131afc48854c0e7
|
| 3 |
size 209816139
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b324ca901edaaaa5841347eeb4f75e6d12219bec163c3b83caffcbf6520d58a3
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f93a058689b8ada6622702c5a5833d4e962616a54302ee183cbf481797944cfb
|
| 3 |
size 14917
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6cc8155f9a668ccc650b25a3629943fe3a02ee796145b1ad7dac78628d32ab5
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6056,6 +6056,294 @@
|
|
| 6056 |
"eval_samples_per_second": 50.848,
|
| 6057 |
"eval_steps_per_second": 3.186,
|
| 6058 |
"step": 21000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6059 |
}
|
| 6060 |
],
|
| 6061 |
"logging_steps": 25,
|
|
@@ -6075,7 +6363,7 @@
|
|
| 6075 |
"attributes": {}
|
| 6076 |
}
|
| 6077 |
},
|
| 6078 |
-
"total_flos": 2.
|
| 6079 |
"train_batch_size": 8,
|
| 6080 |
"trial_name": null,
|
| 6081 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 2000,
|
| 3 |
"best_metric": 9.218317031860352,
|
| 4 |
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
|
| 5 |
+
"epoch": 0.06870834556550091,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 22000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6056 |
"eval_samples_per_second": 50.848,
|
| 6057 |
"eval_steps_per_second": 3.186,
|
| 6058 |
"step": 21000
|
| 6059 |
+
},
|
| 6060 |
+
{
|
| 6061 |
+
"epoch": 0.06566331661430258,
|
| 6062 |
+
"grad_norm": 37.0,
|
| 6063 |
+
"learning_rate": 0.0009993297472148076,
|
| 6064 |
+
"loss": 33.5467,
|
| 6065 |
+
"step": 21025
|
| 6066 |
+
},
|
| 6067 |
+
{
|
| 6068 |
+
"epoch": 0.06574139427971792,
|
| 6069 |
+
"grad_norm": 38.5,
|
| 6070 |
+
"learning_rate": 0.0009993230482511295,
|
| 6071 |
+
"loss": 33.6705,
|
| 6072 |
+
"step": 21050
|
| 6073 |
+
},
|
| 6074 |
+
{
|
| 6075 |
+
"epoch": 0.06581947194513327,
|
| 6076 |
+
"grad_norm": 39.0,
|
| 6077 |
+
"learning_rate": 0.0009993163159993798,
|
| 6078 |
+
"loss": 33.7872,
|
| 6079 |
+
"step": 21075
|
| 6080 |
+
},
|
| 6081 |
+
{
|
| 6082 |
+
"epoch": 0.06589754961054861,
|
| 6083 |
+
"grad_norm": 45.5,
|
| 6084 |
+
"learning_rate": 0.0009993095504600067,
|
| 6085 |
+
"loss": 33.6316,
|
| 6086 |
+
"step": 21100
|
| 6087 |
+
},
|
| 6088 |
+
{
|
| 6089 |
+
"epoch": 0.06597562727596394,
|
| 6090 |
+
"grad_norm": 38.0,
|
| 6091 |
+
"learning_rate": 0.0009993027516334617,
|
| 6092 |
+
"loss": 33.8796,
|
| 6093 |
+
"step": 21125
|
| 6094 |
+
},
|
| 6095 |
+
{
|
| 6096 |
+
"epoch": 0.06605370494137928,
|
| 6097 |
+
"grad_norm": 43.75,
|
| 6098 |
+
"learning_rate": 0.000999295919520198,
|
| 6099 |
+
"loss": 34.0526,
|
| 6100 |
+
"step": 21150
|
| 6101 |
+
},
|
| 6102 |
+
{
|
| 6103 |
+
"epoch": 0.06613178260679463,
|
| 6104 |
+
"grad_norm": 36.0,
|
| 6105 |
+
"learning_rate": 0.000999289054120671,
|
| 6106 |
+
"loss": 34.1438,
|
| 6107 |
+
"step": 21175
|
| 6108 |
+
},
|
| 6109 |
+
{
|
| 6110 |
+
"epoch": 0.06620986027220997,
|
| 6111 |
+
"grad_norm": 38.0,
|
| 6112 |
+
"learning_rate": 0.0009992821554353382,
|
| 6113 |
+
"loss": 33.7974,
|
| 6114 |
+
"step": 21200
|
| 6115 |
+
},
|
| 6116 |
+
{
|
| 6117 |
+
"epoch": 0.06628793793762532,
|
| 6118 |
+
"grad_norm": 46.0,
|
| 6119 |
+
"learning_rate": 0.00099927522346466,
|
| 6120 |
+
"loss": 33.8107,
|
| 6121 |
+
"step": 21225
|
| 6122 |
+
},
|
| 6123 |
+
{
|
| 6124 |
+
"epoch": 0.06636601560304066,
|
| 6125 |
+
"grad_norm": 45.75,
|
| 6126 |
+
"learning_rate": 0.0009992682582090982,
|
| 6127 |
+
"loss": 33.8952,
|
| 6128 |
+
"step": 21250
|
| 6129 |
+
},
|
| 6130 |
+
{
|
| 6131 |
+
"epoch": 0.066444093268456,
|
| 6132 |
+
"grad_norm": 39.5,
|
| 6133 |
+
"learning_rate": 0.0009992612596691171,
|
| 6134 |
+
"loss": 34.201,
|
| 6135 |
+
"step": 21275
|
| 6136 |
+
},
|
| 6137 |
+
{
|
| 6138 |
+
"epoch": 0.06652217093387135,
|
| 6139 |
+
"grad_norm": 49.25,
|
| 6140 |
+
"learning_rate": 0.0009992542278451832,
|
| 6141 |
+
"loss": 34.2007,
|
| 6142 |
+
"step": 21300
|
| 6143 |
+
},
|
| 6144 |
+
{
|
| 6145 |
+
"epoch": 0.06660024859928668,
|
| 6146 |
+
"grad_norm": 42.0,
|
| 6147 |
+
"learning_rate": 0.0009992471627377657,
|
| 6148 |
+
"loss": 34.3501,
|
| 6149 |
+
"step": 21325
|
| 6150 |
+
},
|
| 6151 |
+
{
|
| 6152 |
+
"epoch": 0.06667832626470202,
|
| 6153 |
+
"grad_norm": 48.75,
|
| 6154 |
+
"learning_rate": 0.0009992400643473354,
|
| 6155 |
+
"loss": 34.4321,
|
| 6156 |
+
"step": 21350
|
| 6157 |
+
},
|
| 6158 |
+
{
|
| 6159 |
+
"epoch": 0.06675640393011736,
|
| 6160 |
+
"grad_norm": 43.25,
|
| 6161 |
+
"learning_rate": 0.0009992329326743653,
|
| 6162 |
+
"loss": 34.638,
|
| 6163 |
+
"step": 21375
|
| 6164 |
+
},
|
| 6165 |
+
{
|
| 6166 |
+
"epoch": 0.06683448159553271,
|
| 6167 |
+
"grad_norm": 41.75,
|
| 6168 |
+
"learning_rate": 0.000999225767719331,
|
| 6169 |
+
"loss": 34.588,
|
| 6170 |
+
"step": 21400
|
| 6171 |
+
},
|
| 6172 |
+
{
|
| 6173 |
+
"epoch": 0.06691255926094805,
|
| 6174 |
+
"grad_norm": 44.5,
|
| 6175 |
+
"learning_rate": 0.0009992185694827102,
|
| 6176 |
+
"loss": 34.7111,
|
| 6177 |
+
"step": 21425
|
| 6178 |
+
},
|
| 6179 |
+
{
|
| 6180 |
+
"epoch": 0.0669906369263634,
|
| 6181 |
+
"grad_norm": 50.5,
|
| 6182 |
+
"learning_rate": 0.0009992113379649829,
|
| 6183 |
+
"loss": 34.7677,
|
| 6184 |
+
"step": 21450
|
| 6185 |
+
},
|
| 6186 |
+
{
|
| 6187 |
+
"epoch": 0.06706871459177874,
|
| 6188 |
+
"grad_norm": 62.0,
|
| 6189 |
+
"learning_rate": 0.000999204073166631,
|
| 6190 |
+
"loss": 35.0234,
|
| 6191 |
+
"step": 21475
|
| 6192 |
+
},
|
| 6193 |
+
{
|
| 6194 |
+
"epoch": 0.06714679225719408,
|
| 6195 |
+
"grad_norm": 48.0,
|
| 6196 |
+
"learning_rate": 0.0009991967750881388,
|
| 6197 |
+
"loss": 35.0909,
|
| 6198 |
+
"step": 21500
|
| 6199 |
+
},
|
| 6200 |
+
{
|
| 6201 |
+
"epoch": 0.06722486992260941,
|
| 6202 |
+
"grad_norm": 49.5,
|
| 6203 |
+
"learning_rate": 0.000999189443729993,
|
| 6204 |
+
"loss": 35.4811,
|
| 6205 |
+
"step": 21525
|
| 6206 |
+
},
|
| 6207 |
+
{
|
| 6208 |
+
"epoch": 0.06730294758802476,
|
| 6209 |
+
"grad_norm": 58.0,
|
| 6210 |
+
"learning_rate": 0.0009991820790926824,
|
| 6211 |
+
"loss": 35.2726,
|
| 6212 |
+
"step": 21550
|
| 6213 |
+
},
|
| 6214 |
+
{
|
| 6215 |
+
"epoch": 0.0673810252534401,
|
| 6216 |
+
"grad_norm": 55.5,
|
| 6217 |
+
"learning_rate": 0.0009991746811766975,
|
| 6218 |
+
"loss": 35.629,
|
| 6219 |
+
"step": 21575
|
| 6220 |
+
},
|
| 6221 |
+
{
|
| 6222 |
+
"epoch": 0.06745910291885544,
|
| 6223 |
+
"grad_norm": 44.0,
|
| 6224 |
+
"learning_rate": 0.000999167249982532,
|
| 6225 |
+
"loss": 35.4736,
|
| 6226 |
+
"step": 21600
|
| 6227 |
+
},
|
| 6228 |
+
{
|
| 6229 |
+
"epoch": 0.06753718058427079,
|
| 6230 |
+
"grad_norm": 45.75,
|
| 6231 |
+
"learning_rate": 0.0009991597855106814,
|
| 6232 |
+
"loss": 35.2275,
|
| 6233 |
+
"step": 21625
|
| 6234 |
+
},
|
| 6235 |
+
{
|
| 6236 |
+
"epoch": 0.06761525824968613,
|
| 6237 |
+
"grad_norm": 41.5,
|
| 6238 |
+
"learning_rate": 0.0009991522877616428,
|
| 6239 |
+
"loss": 35.2907,
|
| 6240 |
+
"step": 21650
|
| 6241 |
+
},
|
| 6242 |
+
{
|
| 6243 |
+
"epoch": 0.06769333591510147,
|
| 6244 |
+
"grad_norm": 56.5,
|
| 6245 |
+
"learning_rate": 0.000999144756735916,
|
| 6246 |
+
"loss": 35.2988,
|
| 6247 |
+
"step": 21675
|
| 6248 |
+
},
|
| 6249 |
+
{
|
| 6250 |
+
"epoch": 0.06777141358051682,
|
| 6251 |
+
"grad_norm": 56.0,
|
| 6252 |
+
"learning_rate": 0.000999137192434004,
|
| 6253 |
+
"loss": 35.2948,
|
| 6254 |
+
"step": 21700
|
| 6255 |
+
},
|
| 6256 |
+
{
|
| 6257 |
+
"epoch": 0.06784949124593215,
|
| 6258 |
+
"grad_norm": 42.0,
|
| 6259 |
+
"learning_rate": 0.0009991295948564103,
|
| 6260 |
+
"loss": 35.1186,
|
| 6261 |
+
"step": 21725
|
| 6262 |
+
},
|
| 6263 |
+
{
|
| 6264 |
+
"epoch": 0.06792756891134749,
|
| 6265 |
+
"grad_norm": 43.25,
|
| 6266 |
+
"learning_rate": 0.0009991219640036416,
|
| 6267 |
+
"loss": 35.115,
|
| 6268 |
+
"step": 21750
|
| 6269 |
+
},
|
| 6270 |
+
{
|
| 6271 |
+
"epoch": 0.06800564657676283,
|
| 6272 |
+
"grad_norm": 43.75,
|
| 6273 |
+
"learning_rate": 0.0009991142998762065,
|
| 6274 |
+
"loss": 35.347,
|
| 6275 |
+
"step": 21775
|
| 6276 |
+
},
|
| 6277 |
+
{
|
| 6278 |
+
"epoch": 0.06808372424217818,
|
| 6279 |
+
"grad_norm": 45.0,
|
| 6280 |
+
"learning_rate": 0.000999106602474616,
|
| 6281 |
+
"loss": 35.3008,
|
| 6282 |
+
"step": 21800
|
| 6283 |
+
},
|
| 6284 |
+
{
|
| 6285 |
+
"epoch": 0.06816180190759352,
|
| 6286 |
+
"grad_norm": 66.0,
|
| 6287 |
+
"learning_rate": 0.0009990988717993832,
|
| 6288 |
+
"loss": 35.321,
|
| 6289 |
+
"step": 21825
|
| 6290 |
+
},
|
| 6291 |
+
{
|
| 6292 |
+
"epoch": 0.06823987957300887,
|
| 6293 |
+
"grad_norm": 56.0,
|
| 6294 |
+
"learning_rate": 0.0009990911078510238,
|
| 6295 |
+
"loss": 35.373,
|
| 6296 |
+
"step": 21850
|
| 6297 |
+
},
|
| 6298 |
+
{
|
| 6299 |
+
"epoch": 0.06831795723842421,
|
| 6300 |
+
"grad_norm": 49.25,
|
| 6301 |
+
"learning_rate": 0.000999083310630055,
|
| 6302 |
+
"loss": 35.2404,
|
| 6303 |
+
"step": 21875
|
| 6304 |
+
},
|
| 6305 |
+
{
|
| 6306 |
+
"epoch": 0.06839603490383955,
|
| 6307 |
+
"grad_norm": 46.0,
|
| 6308 |
+
"learning_rate": 0.000999075480136997,
|
| 6309 |
+
"loss": 35.2177,
|
| 6310 |
+
"step": 21900
|
| 6311 |
+
},
|
| 6312 |
+
{
|
| 6313 |
+
"epoch": 0.06847411256925488,
|
| 6314 |
+
"grad_norm": 43.5,
|
| 6315 |
+
"learning_rate": 0.0009990676163723715,
|
| 6316 |
+
"loss": 35.1759,
|
| 6317 |
+
"step": 21925
|
| 6318 |
+
},
|
| 6319 |
+
{
|
| 6320 |
+
"epoch": 0.06855219023467023,
|
| 6321 |
+
"grad_norm": 54.5,
|
| 6322 |
+
"learning_rate": 0.000999059719336703,
|
| 6323 |
+
"loss": 34.7193,
|
| 6324 |
+
"step": 21950
|
| 6325 |
+
},
|
| 6326 |
+
{
|
| 6327 |
+
"epoch": 0.06863026790008557,
|
| 6328 |
+
"grad_norm": 48.25,
|
| 6329 |
+
"learning_rate": 0.0009990517890305175,
|
| 6330 |
+
"loss": 34.6676,
|
| 6331 |
+
"step": 21975
|
| 6332 |
+
},
|
| 6333 |
+
{
|
| 6334 |
+
"epoch": 0.06870834556550091,
|
| 6335 |
+
"grad_norm": 44.75,
|
| 6336 |
+
"learning_rate": 0.0009990438254543442,
|
| 6337 |
+
"loss": 34.4965,
|
| 6338 |
+
"step": 22000
|
| 6339 |
+
},
|
| 6340 |
+
{
|
| 6341 |
+
"epoch": 0.06870834556550091,
|
| 6342 |
+
"eval_loss": 34.531646728515625,
|
| 6343 |
+
"eval_runtime": 102.6371,
|
| 6344 |
+
"eval_samples_per_second": 50.693,
|
| 6345 |
+
"eval_steps_per_second": 3.176,
|
| 6346 |
+
"step": 22000
|
| 6347 |
}
|
| 6348 |
],
|
| 6349 |
"logging_steps": 25,
|
|
|
|
| 6363 |
"attributes": {}
|
| 6364 |
}
|
| 6365 |
},
|
| 6366 |
+
"total_flos": 2.7899608404454277e+18,
|
| 6367 |
"train_batch_size": 8,
|
| 6368 |
"trial_name": null,
|
| 6369 |
"trial_params": null
|