Merge branch 'main' of hf.co:tangledgroup/tangled-alpha-0.9-core
Browse files
- README.md +49 -0
- config.json → config-0.json +3 -3
README.md
CHANGED
|
@@ -76,6 +76,10 @@ Total number of tokens in the optimized dataset '../core-data-6-32769-65537-6553
|
|
| 76 |
|
| 77 |
i=7, min_len=65537, max_len=131073, block_size=131073, chunk_size=16384125, len(dataset)=634, len(dataset) * block_size=83100282
|
| 78 |
Total number of tokens in the optimized dataset '../core-data-7-65537-131073-131073-125' is 83100282
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
```
|
| 80 |
|
| 81 |
```bash
|
|
@@ -83,6 +87,45 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
|
|
| 83 |
```
|
| 84 |
|
| 85 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
```
|
| 87 |
|
| 88 |
Backup `wandb`:
|
|
@@ -91,6 +134,12 @@ Backup `wandb`:
|
|
| 91 |
mv wandb wandb-pretrain-core-0
|
| 92 |
```
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
Chat with model:
|
| 95 |
|
| 96 |
```bash
|
|
|
|
| 76 |
|
| 77 |
i=7, min_len=65537, max_len=131073, block_size=131073, chunk_size=16384125, len(dataset)=634, len(dataset) * block_size=83100282
|
| 78 |
Total number of tokens in the optimized dataset '../core-data-7-65537-131073-131073-125' is 83100282
|
| 79 |
+
|
| 80 |
+
real 292m54.341s
|
| 81 |
+
user 2118m1.154s
|
| 82 |
+
sys 12m2.746s
|
| 83 |
```
|
| 84 |
|
| 85 |
```bash
|
|
|
|
| 87 |
```
|
| 88 |
|
| 89 |
```
|
| 90 |
+
Seed set to 23
|
| 91 |
+
Time to instantiate model: 0.44 seconds.
|
| 92 |
+
Total parameters: 234,914,304
|
| 93 |
+
Verifying settings ...
|
| 94 |
+
Measured TFLOPs: 55520.94
|
| 95 |
+
Epoch 1 | iter 64 step 1 | loss train: 11.977, val: n/a | iter time: 490.27 ms (step) remaining time: 6 days, 22:47:04
|
| 96 |
+
Epoch 1 | iter 128 step 2 | loss train: 11.970, val: n/a | iter time: 351.11 ms (step) remaining time: 4 days, 16:53:01
|
| 97 |
+
Epoch 1 | iter 192 step 3 | loss train: 11.971, val: n/a | iter time: 353.74 ms (step) remaining time: 3 days, 23:43:23
|
| 98 |
+
Epoch 1 | iter 256 step 4 | loss train: 11.974, val: n/a | iter time: 355.03 ms (step) remaining time: 3 days, 14:41:57
|
| 99 |
+
Epoch 1 | iter 320 step 5 | loss train: 11.964, val: n/a | iter time: 357.36 ms (step) remaining time: 3 days, 9:21:54
|
| 100 |
+
Epoch 1 | iter 384 step 6 | loss train: 11.957, val: n/a | iter time: 362.27 ms (step) remaining time: 3 days, 5:53:20
|
| 101 |
+
Epoch 1 | iter 448 step 7 | loss train: 11.948, val: n/a | iter time: 359.89 ms (step) remaining time: 3 days, 3:26:34
|
| 102 |
+
Epoch 1 | iter 512 step 8 | loss train: 11.938, val: n/a | iter time: 363.84 ms (step) remaining time: 3 days, 1:37:54
|
| 103 |
+
Epoch 1 | iter 576 step 9 | loss train: 11.920, val: n/a | iter time: 362.75 ms (step) remaining time: 3 days, 0:13:59
|
| 104 |
+
Epoch 1 | iter 640 step 10 | loss train: 11.900, val: n/a | iter time: 363.46 ms (step) remaining time: 2 days, 23:07:06
|
| 105 |
+
# ...
|
| 106 |
+
Epoch 1 | iter 643264 step 10051 | loss train: 2.834, val: 2.669 | iter time: 360.50 ms (step) remaining time: 0:03:59
|
| 107 |
+
Epoch 2 | iter 643328 step 10052 | loss train: 2.837, val: 2.669 | iter time: 359.53 ms (step) remaining time: 0:03:37
|
| 108 |
+
Epoch 2 | iter 643392 step 10053 | loss train: 2.768, val: 2.669 | iter time: 362.83 ms (step) remaining time: 0:03:15
|
| 109 |
+
Epoch 2 | iter 643456 step 10054 | loss train: 2.695, val: 2.669 | iter time: 363.85 ms (step) remaining time: 0:02:53
|
| 110 |
+
Epoch 2 | iter 643520 step 10055 | loss train: 2.768, val: 2.669 | iter time: 365.40 ms (step) remaining time: 0:02:30
|
| 111 |
+
Epoch 2 | iter 643584 step 10056 | loss train: 2.710, val: 2.669 | iter time: 364.72 ms (step) remaining time: 0:02:08
|
| 112 |
+
Epoch 2 | iter 643648 step 10057 | loss train: 2.749, val: 2.669 | iter time: 365.00 ms (step) remaining time: 0:01:46
|
| 113 |
+
Epoch 2 | iter 643712 step 10058 | loss train: 2.748, val: 2.669 | iter time: 363.42 ms (step) remaining time: 0:01:24
|
| 114 |
+
Epoch 2 | iter 643776 step 10059 | loss train: 2.710, val: 2.669 | iter time: 364.49 ms (step) remaining time: 0:01:02
|
| 115 |
+
Epoch 2 | iter 643840 step 10060 | loss train: 2.738, val: 2.669 | iter time: 364.43 ms (step) remaining time: 0:00:39
|
| 116 |
+
Epoch 2 | iter 643904 step 10061 | loss train: 2.734, val: 2.669 | iter time: 364.94 ms (step) remaining time: 0:00:17
|
| 117 |
+
Validating ...
|
| 118 |
+
Final evaluation | val loss: 2.669 | val ppl: 14.422
|
| 119 |
+
Saving checkpoint to '../out/pretrain-core-0/final/lit_model.pth'
|
| 120 |
+
----------------------------------------
|
| 121 |
+
| Performance
|
| 122 |
+
| - Total tokens : 5,275,279,360
|
| 123 |
+
| - Training Time : 223314.37 s
|
| 124 |
+
| - Tok/sec : 5541.09 tok/s
|
| 125 |
+
| ----------------------------------------
|
| 126 |
+
| Memory Usage
|
| 127 |
+
| - Memory Used : 22.33 GB
|
| 128 |
+
----------------------------------------
|
| 129 |
```
|
| 130 |
|
| 131 |
Backup `wandb`:
|
|
|
|
| 134 |
mv wandb wandb-pretrain-core-0
|
| 135 |
```
|
| 136 |
|
| 137 |
+
Copy config:
|
| 138 |
+
|
| 139 |
+
```bash
|
| 140 |
+
cp ../config-0.json ../out/pretrain-core-0/final
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
Chat with model:
|
| 144 |
|
| 145 |
```bash
|
config.json → config-0.json
RENAMED
|
@@ -14,13 +14,13 @@
|
|
| 14 |
"max_position_embeddings": 131072,
|
| 15 |
"mlp_bias": false,
|
| 16 |
"model_type": "llama",
|
| 17 |
-
"num_attention_heads":
|
| 18 |
"num_hidden_layers": 32,
|
| 19 |
-
"num_key_value_heads":
|
| 20 |
"pretraining_tp": 1,
|
| 21 |
"rms_norm_eps": 1e-05,
|
| 22 |
"rope_scaling": null,
|
| 23 |
-
"rope_theta":
|
| 24 |
"tie_word_embeddings": true,
|
| 25 |
"torch_dtype": "bfloat16",
|
| 26 |
"transformers_version": "4.45.0.dev0",
|
|
|
|
| 14 |
"max_position_embeddings": 131072,
|
| 15 |
"mlp_bias": false,
|
| 16 |
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 8,
|
| 18 |
"num_hidden_layers": 32,
|
| 19 |
+
"num_key_value_heads": 8,
|
| 20 |
"pretraining_tp": 1,
|
| 21 |
"rms_norm_eps": 1e-05,
|
| 22 |
"rope_scaling": null,
|
| 23 |
+
"rope_theta": 4300.0,
|
| 24 |
"tie_word_embeddings": true,
|
| 25 |
"torch_dtype": "bfloat16",
|
| 26 |
"transformers_version": "4.45.0.dev0",
|