[GHA] experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb result notebook & reports
#167
by
picocreator
- opened
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb
CHANGED
|
@@ -3,13 +3,13 @@
|
|
| 3 |
{
|
| 4 |
"attachments": {},
|
| 5 |
"cell_type": "markdown",
|
| 6 |
-
"id": "
|
| 7 |
"metadata": {
|
| 8 |
"papermill": {
|
| 9 |
-
"duration": 0.
|
| 10 |
-
"end_time": "2023-09-
|
| 11 |
"exception": false,
|
| 12 |
-
"start_time": "2023-09-
|
| 13 |
"status": "completed"
|
| 14 |
},
|
| 15 |
"tags": []
|
|
@@ -23,13 +23,13 @@
|
|
| 23 |
{
|
| 24 |
"attachments": {},
|
| 25 |
"cell_type": "markdown",
|
| 26 |
-
"id": "
|
| 27 |
"metadata": {
|
| 28 |
"papermill": {
|
| 29 |
-
"duration": 0.
|
| 30 |
-
"end_time": "2023-09-
|
| 31 |
"exception": false,
|
| 32 |
-
"start_time": "2023-09-
|
| 33 |
"status": "completed"
|
| 34 |
},
|
| 35 |
"tags": []
|
|
@@ -41,19 +41,19 @@
|
|
| 41 |
{
|
| 42 |
"cell_type": "code",
|
| 43 |
"execution_count": 1,
|
| 44 |
-
"id": "
|
| 45 |
"metadata": {
|
| 46 |
"execution": {
|
| 47 |
-
"iopub.execute_input": "2023-09-
|
| 48 |
-
"iopub.status.busy": "2023-09-
|
| 49 |
-
"iopub.status.idle": "2023-09-
|
| 50 |
-
"shell.execute_reply": "2023-09-
|
| 51 |
},
|
| 52 |
"papermill": {
|
| 53 |
-
"duration": 0.
|
| 54 |
-
"end_time": "2023-09-
|
| 55 |
"exception": false,
|
| 56 |
-
"start_time": "2023-09-
|
| 57 |
"status": "completed"
|
| 58 |
},
|
| 59 |
"tags": []
|
|
@@ -69,19 +69,19 @@
|
|
| 69 |
{
|
| 70 |
"cell_type": "code",
|
| 71 |
"execution_count": 2,
|
| 72 |
-
"id": "
|
| 73 |
"metadata": {
|
| 74 |
"execution": {
|
| 75 |
-
"iopub.execute_input": "2023-09-
|
| 76 |
-
"iopub.status.busy": "2023-09-
|
| 77 |
-
"iopub.status.idle": "2023-09-
|
| 78 |
-
"shell.execute_reply": "2023-09-
|
| 79 |
},
|
| 80 |
"papermill": {
|
| 81 |
-
"duration": 0.
|
| 82 |
-
"end_time": "2023-09-
|
| 83 |
"exception": false,
|
| 84 |
-
"start_time": "2023-09-
|
| 85 |
"status": "completed"
|
| 86 |
},
|
| 87 |
"tags": []
|
|
@@ -140,19 +140,19 @@
|
|
| 140 |
{
|
| 141 |
"cell_type": "code",
|
| 142 |
"execution_count": 3,
|
| 143 |
-
"id": "
|
| 144 |
"metadata": {
|
| 145 |
"execution": {
|
| 146 |
-
"iopub.execute_input": "2023-09-
|
| 147 |
-
"iopub.status.busy": "2023-09-
|
| 148 |
-
"iopub.status.idle": "2023-09-
|
| 149 |
-
"shell.execute_reply": "2023-09-
|
| 150 |
},
|
| 151 |
"papermill": {
|
| 152 |
-
"duration":
|
| 153 |
-
"end_time": "2023-09-
|
| 154 |
"exception": false,
|
| 155 |
-
"start_time": "2023-09-
|
| 156 |
"status": "completed"
|
| 157 |
},
|
| 158 |
"tags": []
|
|
@@ -162,14 +162,20 @@
|
|
| 162 |
"name": "stdout",
|
| 163 |
"output_type": "stream",
|
| 164 |
"text": [
|
| 165 |
-
"[2023-09-29
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
]
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"name": "stdout",
|
| 170 |
"output_type": "stream",
|
| 171 |
"text": [
|
| 172 |
-
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
|
| 173 |
"---- Initializing model ----\r\n",
|
| 174 |
"No of layers: 6\r\n",
|
| 175 |
"Embedding size: 2048\r\n",
|
|
@@ -228,42 +234,42 @@
|
|
| 228 |
"output_type": "stream",
|
| 229 |
"text": [
|
| 230 |
"2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
|
| 231 |
-
"2048 7168 0 blocks.0.ffn.value.weight\r\n"
|
|
|
|
| 232 |
]
|
| 233 |
},
|
| 234 |
{
|
| 235 |
"name": "stdout",
|
| 236 |
"output_type": "stream",
|
| 237 |
"text": [
|
| 238 |
-
"2048 2048 1.0 blocks.1.att.
|
| 239 |
]
|
| 240 |
},
|
| 241 |
{
|
| 242 |
"name": "stdout",
|
| 243 |
"output_type": "stream",
|
| 244 |
"text": [
|
| 245 |
-
"2048 2048 1.0 blocks.1.att.
|
| 246 |
]
|
| 247 |
},
|
| 248 |
{
|
| 249 |
"name": "stdout",
|
| 250 |
"output_type": "stream",
|
| 251 |
"text": [
|
| 252 |
-
"2048 2048 1.0 blocks.1.att.
|
| 253 |
]
|
| 254 |
},
|
| 255 |
{
|
| 256 |
"name": "stdout",
|
| 257 |
"output_type": "stream",
|
| 258 |
"text": [
|
| 259 |
-
"2048 2048
|
| 260 |
]
|
| 261 |
},
|
| 262 |
{
|
| 263 |
"name": "stdout",
|
| 264 |
"output_type": "stream",
|
| 265 |
"text": [
|
| 266 |
-
"2048 2048 0 blocks.1.att.output.weight\r\n",
|
| 267 |
"7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
|
| 268 |
]
|
| 269 |
},
|
|
@@ -272,42 +278,42 @@
|
|
| 272 |
"output_type": "stream",
|
| 273 |
"text": [
|
| 274 |
"2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
|
| 275 |
-
"2048 7168 0 blocks.1.ffn.value.weight\r\n"
|
| 276 |
-
"2048 2048 1.0 blocks.2.att.gate.weight\r\n"
|
| 277 |
]
|
| 278 |
},
|
| 279 |
{
|
| 280 |
"name": "stdout",
|
| 281 |
"output_type": "stream",
|
| 282 |
"text": [
|
| 283 |
-
"2048 2048 1.0 blocks.2.att.
|
| 284 |
]
|
| 285 |
},
|
| 286 |
{
|
| 287 |
"name": "stdout",
|
| 288 |
"output_type": "stream",
|
| 289 |
"text": [
|
| 290 |
-
"2048 2048 1.0 blocks.2.att.
|
| 291 |
]
|
| 292 |
},
|
| 293 |
{
|
| 294 |
"name": "stdout",
|
| 295 |
"output_type": "stream",
|
| 296 |
"text": [
|
| 297 |
-
"2048 2048 1.0 blocks.2.att.
|
| 298 |
]
|
| 299 |
},
|
| 300 |
{
|
| 301 |
"name": "stdout",
|
| 302 |
"output_type": "stream",
|
| 303 |
"text": [
|
| 304 |
-
"2048 2048 0
|
| 305 |
]
|
| 306 |
},
|
| 307 |
{
|
| 308 |
"name": "stdout",
|
| 309 |
"output_type": "stream",
|
| 310 |
"text": [
|
|
|
|
| 311 |
"7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
|
| 312 |
]
|
| 313 |
},
|
|
@@ -360,13 +366,7 @@
|
|
| 360 |
"output_type": "stream",
|
| 361 |
"text": [
|
| 362 |
"2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
|
| 363 |
-
"2048 7168 0 blocks.3.ffn.value.weight\r\n"
|
| 364 |
-
]
|
| 365 |
-
},
|
| 366 |
-
{
|
| 367 |
-
"name": "stdout",
|
| 368 |
-
"output_type": "stream",
|
| 369 |
-
"text": [
|
| 370 |
"2048 2048 1.0 blocks.4.att.gate.weight\r\n"
|
| 371 |
]
|
| 372 |
},
|
|
@@ -404,13 +404,7 @@
|
|
| 404 |
"output_type": "stream",
|
| 405 |
"text": [
|
| 406 |
"2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
|
| 407 |
-
"2048 7168 0 blocks.4.ffn.value.weight\r\n"
|
| 408 |
-
]
|
| 409 |
-
},
|
| 410 |
-
{
|
| 411 |
-
"name": "stdout",
|
| 412 |
-
"output_type": "stream",
|
| 413 |
-
"text": [
|
| 414 |
"2048 2048 1.0 blocks.5.att.gate.weight\r\n"
|
| 415 |
]
|
| 416 |
},
|
|
@@ -439,13 +433,7 @@
|
|
| 439 |
"name": "stdout",
|
| 440 |
"output_type": "stream",
|
| 441 |
"text": [
|
| 442 |
-
"2048 2048 0 blocks.5.att.output.weight\r\n"
|
| 443 |
-
]
|
| 444 |
-
},
|
| 445 |
-
{
|
| 446 |
-
"name": "stdout",
|
| 447 |
-
"output_type": "stream",
|
| 448 |
-
"text": [
|
| 449 |
"7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
|
| 450 |
]
|
| 451 |
},
|
|
@@ -471,13 +459,13 @@
|
|
| 471 |
},
|
| 472 |
{
|
| 473 |
"cell_type": "markdown",
|
| 474 |
-
"id": "
|
| 475 |
"metadata": {
|
| 476 |
"papermill": {
|
| 477 |
-
"duration": 0.
|
| 478 |
-
"end_time": "2023-09-
|
| 479 |
"exception": false,
|
| 480 |
-
"start_time": "2023-09-
|
| 481 |
"status": "completed"
|
| 482 |
},
|
| 483 |
"tags": []
|
|
@@ -489,19 +477,19 @@
|
|
| 489 |
{
|
| 490 |
"cell_type": "code",
|
| 491 |
"execution_count": 4,
|
| 492 |
-
"id": "
|
| 493 |
"metadata": {
|
| 494 |
"execution": {
|
| 495 |
-
"iopub.execute_input": "2023-09-
|
| 496 |
-
"iopub.status.busy": "2023-09-
|
| 497 |
-
"iopub.status.idle": "2023-09-
|
| 498 |
-
"shell.execute_reply": "2023-09-
|
| 499 |
},
|
| 500 |
"papermill": {
|
| 501 |
-
"duration": 5.
|
| 502 |
-
"end_time": "2023-09-
|
| 503 |
"exception": false,
|
| 504 |
-
"start_time": "2023-09-
|
| 505 |
"status": "completed"
|
| 506 |
},
|
| 507 |
"tags": []
|
|
@@ -527,19 +515,19 @@
|
|
| 527 |
{
|
| 528 |
"cell_type": "code",
|
| 529 |
"execution_count": 5,
|
| 530 |
-
"id": "
|
| 531 |
"metadata": {
|
| 532 |
"execution": {
|
| 533 |
-
"iopub.execute_input": "2023-09-
|
| 534 |
-
"iopub.status.busy": "2023-09-
|
| 535 |
-
"iopub.status.idle": "2023-09-
|
| 536 |
-
"shell.execute_reply": "2023-09-
|
| 537 |
},
|
| 538 |
"papermill": {
|
| 539 |
-
"duration": 0.
|
| 540 |
-
"end_time": "2023-09-
|
| 541 |
"exception": false,
|
| 542 |
-
"start_time": "2023-09-
|
| 543 |
"status": "completed"
|
| 544 |
},
|
| 545 |
"tags": []
|
|
@@ -549,17 +537,16 @@
|
|
| 549 |
"name": "stdout",
|
| 550 |
"output_type": "stream",
|
| 551 |
"text": [
|
| 552 |
-
"/usr/bin/sh: 1:
|
| 553 |
]
|
| 554 |
}
|
| 555 |
],
|
| 556 |
"source": [
|
| 557 |
"# Start the foundation model training\n",
|
| 558 |
"!cd \"{TRAINER_DIR}\" && \\\n",
|
| 559 |
-
" export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
|
| 560 |
" export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
|
| 561 |
" python lightning_trainer.py fit \\\n",
|
| 562 |
-
" -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n",
|
| 563 |
" --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
|
| 564 |
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
|
| 565 |
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
|
|
@@ -572,19 +559,19 @@
|
|
| 572 |
{
|
| 573 |
"cell_type": "code",
|
| 574 |
"execution_count": 6,
|
| 575 |
-
"id": "
|
| 576 |
"metadata": {
|
| 577 |
"execution": {
|
| 578 |
-
"iopub.execute_input": "2023-09-
|
| 579 |
-
"iopub.status.busy": "2023-09-
|
| 580 |
-
"iopub.status.idle": "2023-09-
|
| 581 |
-
"shell.execute_reply": "2023-09-
|
| 582 |
},
|
| 583 |
"papermill": {
|
| 584 |
-
"duration": 0.
|
| 585 |
-
"end_time": "2023-09-
|
| 586 |
"exception": false,
|
| 587 |
-
"start_time": "2023-09-
|
| 588 |
"status": "completed"
|
| 589 |
},
|
| 590 |
"tags": []
|
|
@@ -615,19 +602,19 @@
|
|
| 615 |
{
|
| 616 |
"cell_type": "code",
|
| 617 |
"execution_count": 7,
|
| 618 |
-
"id": "
|
| 619 |
"metadata": {
|
| 620 |
"execution": {
|
| 621 |
-
"iopub.execute_input": "2023-09-
|
| 622 |
-
"iopub.status.busy": "2023-09-
|
| 623 |
-
"iopub.status.idle": "2023-09-
|
| 624 |
-
"shell.execute_reply": "2023-09-
|
| 625 |
},
|
| 626 |
"papermill": {
|
| 627 |
-
"duration":
|
| 628 |
-
"end_time": "2023-09-
|
| 629 |
"exception": false,
|
| 630 |
-
"start_time": "2023-09-
|
| 631 |
"status": "completed"
|
| 632 |
},
|
| 633 |
"tags": []
|
|
@@ -637,14 +624,28 @@
|
|
| 637 |
"name": "stdout",
|
| 638 |
"output_type": "stream",
|
| 639 |
"text": [
|
| 640 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
]
|
| 642 |
}
|
| 643 |
],
|
| 644 |
"source": [
|
| 645 |
"# # Lets do a quick dragon prompt validation\n",
|
| 646 |
"!cd \"{INFERENCE_DIR}\" && \\\n",
|
| 647 |
-
" export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
|
| 648 |
" python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
|
| 649 |
]
|
| 650 |
}
|
|
@@ -669,14 +670,14 @@
|
|
| 669 |
},
|
| 670 |
"papermill": {
|
| 671 |
"default_parameters": {},
|
| 672 |
-
"duration":
|
| 673 |
-
"end_time": "2023-09-
|
| 674 |
"environment_variables": {},
|
| 675 |
"exception": null,
|
| 676 |
"input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
| 677 |
"output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
| 678 |
"parameters": {},
|
| 679 |
-
"start_time": "2023-09-
|
| 680 |
"version": "2.4.0"
|
| 681 |
}
|
| 682 |
},
|
|
|
|
| 3 |
{
|
| 4 |
"attachments": {},
|
| 5 |
"cell_type": "markdown",
|
| 6 |
+
"id": "ef458e0c",
|
| 7 |
"metadata": {
|
| 8 |
"papermill": {
|
| 9 |
+
"duration": 0.002614,
|
| 10 |
+
"end_time": "2023-09-29T05:06:25.725060",
|
| 11 |
"exception": false,
|
| 12 |
+
"start_time": "2023-09-29T05:06:25.722446",
|
| 13 |
"status": "completed"
|
| 14 |
},
|
| 15 |
"tags": []
|
|
|
|
| 23 |
{
|
| 24 |
"attachments": {},
|
| 25 |
"cell_type": "markdown",
|
| 26 |
+
"id": "58eb3f3e",
|
| 27 |
"metadata": {
|
| 28 |
"papermill": {
|
| 29 |
+
"duration": 0.00201,
|
| 30 |
+
"end_time": "2023-09-29T05:06:25.730966",
|
| 31 |
"exception": false,
|
| 32 |
+
"start_time": "2023-09-29T05:06:25.728956",
|
| 33 |
"status": "completed"
|
| 34 |
},
|
| 35 |
"tags": []
|
|
|
|
| 41 |
{
|
| 42 |
"cell_type": "code",
|
| 43 |
"execution_count": 1,
|
| 44 |
+
"id": "e0abbad9",
|
| 45 |
"metadata": {
|
| 46 |
"execution": {
|
| 47 |
+
"iopub.execute_input": "2023-09-29T05:06:25.737449Z",
|
| 48 |
+
"iopub.status.busy": "2023-09-29T05:06:25.736495Z",
|
| 49 |
+
"iopub.status.idle": "2023-09-29T05:06:26.482958Z",
|
| 50 |
+
"shell.execute_reply": "2023-09-29T05:06:26.482054Z"
|
| 51 |
},
|
| 52 |
"papermill": {
|
| 53 |
+
"duration": 0.751859,
|
| 54 |
+
"end_time": "2023-09-29T05:06:26.485032",
|
| 55 |
"exception": false,
|
| 56 |
+
"start_time": "2023-09-29T05:06:25.733173",
|
| 57 |
"status": "completed"
|
| 58 |
},
|
| 59 |
"tags": []
|
|
|
|
| 69 |
{
|
| 70 |
"cell_type": "code",
|
| 71 |
"execution_count": 2,
|
| 72 |
+
"id": "42d56a7f",
|
| 73 |
"metadata": {
|
| 74 |
"execution": {
|
| 75 |
+
"iopub.execute_input": "2023-09-29T05:06:26.491452Z",
|
| 76 |
+
"iopub.status.busy": "2023-09-29T05:06:26.490928Z",
|
| 77 |
+
"iopub.status.idle": "2023-09-29T05:06:26.499148Z",
|
| 78 |
+
"shell.execute_reply": "2023-09-29T05:06:26.498384Z"
|
| 79 |
},
|
| 80 |
"papermill": {
|
| 81 |
+
"duration": 0.013307,
|
| 82 |
+
"end_time": "2023-09-29T05:06:26.500768",
|
| 83 |
"exception": false,
|
| 84 |
+
"start_time": "2023-09-29T05:06:26.487461",
|
| 85 |
"status": "completed"
|
| 86 |
},
|
| 87 |
"tags": []
|
|
|
|
| 140 |
{
|
| 141 |
"cell_type": "code",
|
| 142 |
"execution_count": 3,
|
| 143 |
+
"id": "5514ed91",
|
| 144 |
"metadata": {
|
| 145 |
"execution": {
|
| 146 |
+
"iopub.execute_input": "2023-09-29T05:06:26.507274Z",
|
| 147 |
+
"iopub.status.busy": "2023-09-29T05:06:26.506786Z",
|
| 148 |
+
"iopub.status.idle": "2023-09-29T05:06:55.991075Z",
|
| 149 |
+
"shell.execute_reply": "2023-09-29T05:06:55.990231Z"
|
| 150 |
},
|
| 151 |
"papermill": {
|
| 152 |
+
"duration": 29.490941,
|
| 153 |
+
"end_time": "2023-09-29T05:06:55.994238",
|
| 154 |
"exception": false,
|
| 155 |
+
"start_time": "2023-09-29T05:06:26.503297",
|
| 156 |
"status": "completed"
|
| 157 |
},
|
| 158 |
"tags": []
|
|
|
|
| 162 |
"name": "stdout",
|
| 163 |
"output_type": "stream",
|
| 164 |
"text": [
|
| 165 |
+
"[2023-09-29 05:06:30,625] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
|
| 166 |
+
]
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"name": "stdout",
|
| 170 |
+
"output_type": "stream",
|
| 171 |
+
"text": [
|
| 172 |
+
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
|
| 173 |
]
|
| 174 |
},
|
| 175 |
{
|
| 176 |
"name": "stdout",
|
| 177 |
"output_type": "stream",
|
| 178 |
"text": [
|
|
|
|
| 179 |
"---- Initializing model ----\r\n",
|
| 180 |
"No of layers: 6\r\n",
|
| 181 |
"Embedding size: 2048\r\n",
|
|
|
|
| 234 |
"output_type": "stream",
|
| 235 |
"text": [
|
| 236 |
"2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
|
| 237 |
+
"2048 7168 0 blocks.0.ffn.value.weight\r\n",
|
| 238 |
+
"2048 2048 1.0 blocks.1.att.gate.weight\r\n"
|
| 239 |
]
|
| 240 |
},
|
| 241 |
{
|
| 242 |
"name": "stdout",
|
| 243 |
"output_type": "stream",
|
| 244 |
"text": [
|
| 245 |
+
"2048 2048 1.0 blocks.1.att.receptance.weight\r\n"
|
| 246 |
]
|
| 247 |
},
|
| 248 |
{
|
| 249 |
"name": "stdout",
|
| 250 |
"output_type": "stream",
|
| 251 |
"text": [
|
| 252 |
+
"2048 2048 1.0 blocks.1.att.key.weight\r\n"
|
| 253 |
]
|
| 254 |
},
|
| 255 |
{
|
| 256 |
"name": "stdout",
|
| 257 |
"output_type": "stream",
|
| 258 |
"text": [
|
| 259 |
+
"2048 2048 1.0 blocks.1.att.value.weight\r\n"
|
| 260 |
]
|
| 261 |
},
|
| 262 |
{
|
| 263 |
"name": "stdout",
|
| 264 |
"output_type": "stream",
|
| 265 |
"text": [
|
| 266 |
+
"2048 2048 0 blocks.1.att.output.weight\r\n"
|
| 267 |
]
|
| 268 |
},
|
| 269 |
{
|
| 270 |
"name": "stdout",
|
| 271 |
"output_type": "stream",
|
| 272 |
"text": [
|
|
|
|
| 273 |
"7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
|
| 274 |
]
|
| 275 |
},
|
|
|
|
| 278 |
"output_type": "stream",
|
| 279 |
"text": [
|
| 280 |
"2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
|
| 281 |
+
"2048 7168 0 blocks.1.ffn.value.weight\r\n"
|
|
|
|
| 282 |
]
|
| 283 |
},
|
| 284 |
{
|
| 285 |
"name": "stdout",
|
| 286 |
"output_type": "stream",
|
| 287 |
"text": [
|
| 288 |
+
"2048 2048 1.0 blocks.2.att.gate.weight\r\n"
|
| 289 |
]
|
| 290 |
},
|
| 291 |
{
|
| 292 |
"name": "stdout",
|
| 293 |
"output_type": "stream",
|
| 294 |
"text": [
|
| 295 |
+
"2048 2048 1.0 blocks.2.att.receptance.weight\r\n"
|
| 296 |
]
|
| 297 |
},
|
| 298 |
{
|
| 299 |
"name": "stdout",
|
| 300 |
"output_type": "stream",
|
| 301 |
"text": [
|
| 302 |
+
"2048 2048 1.0 blocks.2.att.key.weight\r\n"
|
| 303 |
]
|
| 304 |
},
|
| 305 |
{
|
| 306 |
"name": "stdout",
|
| 307 |
"output_type": "stream",
|
| 308 |
"text": [
|
| 309 |
+
"2048 2048 1.0 blocks.2.att.value.weight\r\n"
|
| 310 |
]
|
| 311 |
},
|
| 312 |
{
|
| 313 |
"name": "stdout",
|
| 314 |
"output_type": "stream",
|
| 315 |
"text": [
|
| 316 |
+
"2048 2048 0 blocks.2.att.output.weight\r\n",
|
| 317 |
"7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
|
| 318 |
]
|
| 319 |
},
|
|
|
|
| 366 |
"output_type": "stream",
|
| 367 |
"text": [
|
| 368 |
"2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
|
| 369 |
+
"2048 7168 0 blocks.3.ffn.value.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
"2048 2048 1.0 blocks.4.att.gate.weight\r\n"
|
| 371 |
]
|
| 372 |
},
|
|
|
|
| 404 |
"output_type": "stream",
|
| 405 |
"text": [
|
| 406 |
"2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
|
| 407 |
+
"2048 7168 0 blocks.4.ffn.value.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
"2048 2048 1.0 blocks.5.att.gate.weight\r\n"
|
| 409 |
]
|
| 410 |
},
|
|
|
|
| 433 |
"name": "stdout",
|
| 434 |
"output_type": "stream",
|
| 435 |
"text": [
|
| 436 |
+
"2048 2048 0 blocks.5.att.output.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
"7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
|
| 438 |
]
|
| 439 |
},
|
|
|
|
| 459 |
},
|
| 460 |
{
|
| 461 |
"cell_type": "markdown",
|
| 462 |
+
"id": "8afd9e50",
|
| 463 |
"metadata": {
|
| 464 |
"papermill": {
|
| 465 |
+
"duration": 0.005752,
|
| 466 |
+
"end_time": "2023-09-29T05:06:56.006385",
|
| 467 |
"exception": false,
|
| 468 |
+
"start_time": "2023-09-29T05:06:56.000633",
|
| 469 |
"status": "completed"
|
| 470 |
},
|
| 471 |
"tags": []
|
|
|
|
| 477 |
{
|
| 478 |
"cell_type": "code",
|
| 479 |
"execution_count": 4,
|
| 480 |
+
"id": "ff78d2bd",
|
| 481 |
"metadata": {
|
| 482 |
"execution": {
|
| 483 |
+
"iopub.execute_input": "2023-09-29T05:06:56.020959Z",
|
| 484 |
+
"iopub.status.busy": "2023-09-29T05:06:56.020447Z",
|
| 485 |
+
"iopub.status.idle": "2023-09-29T05:07:01.579575Z",
|
| 486 |
+
"shell.execute_reply": "2023-09-29T05:07:01.578476Z"
|
| 487 |
},
|
| 488 |
"papermill": {
|
| 489 |
+
"duration": 5.569483,
|
| 490 |
+
"end_time": "2023-09-29T05:07:01.582319",
|
| 491 |
"exception": false,
|
| 492 |
+
"start_time": "2023-09-29T05:06:56.012836",
|
| 493 |
"status": "completed"
|
| 494 |
},
|
| 495 |
"tags": []
|
|
|
|
| 515 |
{
|
| 516 |
"cell_type": "code",
|
| 517 |
"execution_count": 5,
|
| 518 |
+
"id": "f656d56b",
|
| 519 |
"metadata": {
|
| 520 |
"execution": {
|
| 521 |
+
"iopub.execute_input": "2023-09-29T05:07:01.598719Z",
|
| 522 |
+
"iopub.status.busy": "2023-09-29T05:07:01.597947Z",
|
| 523 |
+
"iopub.status.idle": "2023-09-29T05:07:01.851778Z",
|
| 524 |
+
"shell.execute_reply": "2023-09-29T05:07:01.850738Z"
|
| 525 |
},
|
| 526 |
"papermill": {
|
| 527 |
+
"duration": 0.265316,
|
| 528 |
+
"end_time": "2023-09-29T05:07:01.854564",
|
| 529 |
"exception": false,
|
| 530 |
+
"start_time": "2023-09-29T05:07:01.589248",
|
| 531 |
"status": "completed"
|
| 532 |
},
|
| 533 |
"tags": []
|
|
|
|
| 537 |
"name": "stdout",
|
| 538 |
"output_type": "stream",
|
| 539 |
"text": [
|
| 540 |
+
"/usr/bin/sh: 1: python: not found\r\n"
|
| 541 |
]
|
| 542 |
}
|
| 543 |
],
|
| 544 |
"source": [
|
| 545 |
"# Start the foundation model training\n",
|
| 546 |
"!cd \"{TRAINER_DIR}\" && \\\n",
|
|
|
|
| 547 |
" export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
|
| 548 |
" python lightning_trainer.py fit \\\n",
|
| 549 |
+
" -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\" \\\n",
|
| 550 |
" --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
|
| 551 |
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
|
| 552 |
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
|
|
|
|
| 559 |
{
|
| 560 |
"cell_type": "code",
|
| 561 |
"execution_count": 6,
|
| 562 |
+
"id": "c7b46f94",
|
| 563 |
"metadata": {
|
| 564 |
"execution": {
|
| 565 |
+
"iopub.execute_input": "2023-09-29T05:07:01.871225Z",
|
| 566 |
+
"iopub.status.busy": "2023-09-29T05:07:01.870345Z",
|
| 567 |
+
"iopub.status.idle": "2023-09-29T05:07:02.373808Z",
|
| 568 |
+
"shell.execute_reply": "2023-09-29T05:07:02.372753Z"
|
| 569 |
},
|
| 570 |
"papermill": {
|
| 571 |
+
"duration": 0.51526,
|
| 572 |
+
"end_time": "2023-09-29T05:07:02.376685",
|
| 573 |
"exception": false,
|
| 574 |
+
"start_time": "2023-09-29T05:07:01.861425",
|
| 575 |
"status": "completed"
|
| 576 |
},
|
| 577 |
"tags": []
|
|
|
|
| 602 |
{
|
| 603 |
"cell_type": "code",
|
| 604 |
"execution_count": 7,
|
| 605 |
+
"id": "9f558c57",
|
| 606 |
"metadata": {
|
| 607 |
"execution": {
|
| 608 |
+
"iopub.execute_input": "2023-09-29T05:07:02.393471Z",
|
| 609 |
+
"iopub.status.busy": "2023-09-29T05:07:02.392695Z",
|
| 610 |
+
"iopub.status.idle": "2023-09-29T05:07:08.804315Z",
|
| 611 |
+
"shell.execute_reply": "2023-09-29T05:07:08.803244Z"
|
| 612 |
},
|
| 613 |
"papermill": {
|
| 614 |
+
"duration": 6.42299,
|
| 615 |
+
"end_time": "2023-09-29T05:07:08.806769",
|
| 616 |
"exception": false,
|
| 617 |
+
"start_time": "2023-09-29T05:07:02.383779",
|
| 618 |
"status": "completed"
|
| 619 |
},
|
| 620 |
"tags": []
|
|
|
|
| 624 |
"name": "stdout",
|
| 625 |
"output_type": "stream",
|
| 626 |
"text": [
|
| 627 |
+
"[2023-09-29 05:07:06,749] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
|
| 628 |
+
]
|
| 629 |
+
},
|
| 630 |
+
{
|
| 631 |
+
"name": "stdout",
|
| 632 |
+
"output_type": "stream",
|
| 633 |
+
"text": [
|
| 634 |
+
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
|
| 635 |
+
"Traceback (most recent call last):\r\n",
|
| 636 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
|
| 637 |
+
" model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
|
| 638 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
|
| 639 |
+
" self.model = RWKV(**model_config)\r\n",
|
| 640 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
|
| 641 |
+
" raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
|
| 642 |
+
"ValueError: load_model file '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n"
|
| 643 |
]
|
| 644 |
}
|
| 645 |
],
|
| 646 |
"source": [
|
| 647 |
"# # Lets do a quick dragon prompt validation\n",
|
| 648 |
"!cd \"{INFERENCE_DIR}\" && \\\n",
|
|
|
|
| 649 |
" python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
|
| 650 |
]
|
| 651 |
}
|
|
|
|
| 670 |
},
|
| 671 |
"papermill": {
|
| 672 |
"default_parameters": {},
|
| 673 |
+
"duration": 44.644446,
|
| 674 |
+
"end_time": "2023-09-29T05:07:09.133994",
|
| 675 |
"environment_variables": {},
|
| 676 |
"exception": null,
|
| 677 |
"input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
| 678 |
"output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
| 679 |
"parameters": {},
|
| 680 |
+
"start_time": "2023-09-29T05:06:24.489548",
|
| 681 |
"version": "2.4.0"
|
| 682 |
}
|
| 683 |
},
|