Upload folder using huggingface_hub
- README.md +21 -16
- checkpoint_epoch_1.pt +2 -2
- config.json +13 -0
- latest_checkpoint.pt +2 -2
- training_history.json +2 -2
- trial_results.json +4 -4
README.md
CHANGED
@@ -11,48 +11,53 @@ datasets:
 - xsum
 metrics:
 - rouge
+widget:
+- text: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris."
+  example_title: "Sample Text"
 ---

 # MoE Text Summarization Model (Trial Run)

 ## Model Description

-This is a Mixture-of-Experts (MoE) model for text summarization, trained on a small subset of the XSum dataset as a trial run.
+This is a Mixture-of-Experts (MoE) model for text summarization, trained on a small subset of the XSum dataset as a trial run. The model demonstrates the MoE architecture with 4 experts and top-2 routing.

 ## Model Details

 - **Model Type**: Mixture-of-Experts Text Summarization
 - **Architecture**: Encoder-Decoder with MoE in encoder
 - **Training Data**: XSum dataset (trial: 10 samples)
-- **Routing Type**:
+- **Routing Type**: topk
 - **Number of Experts**: 4
 - **Top-K**: 2

-##
+## Training Details

--
--
--
-- `trial_results.json`: Complete trial run results
+- **Training Samples**: 10 (trial run)
+- **Epochs**: 1
+- **Final Loss**: 10.604265594482422

 ## Usage

 ```python
 import torch
+from transformers import AutoTokenizer

-# Load
-
-model_state = checkpoint['model_state_dict']
-model_config = checkpoint['model_config']
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-xsum')

-#
-#
+# Load model (you'll need the MoE implementation)
+# model = MoESummarizationModel.from_pretrained('vivekdhayaal/moe-xsum-trial')
+
+# Example usage
+text = "Your input text here..."
+# Generate summary with the model
 ```

-##
+## Note

-This
-For production use, train on the full dataset.
+This is a trial run model trained on only 10 samples for demonstration purposes.
+For production use, train on the full XSum dataset.

 ## Citation

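The usage snippet in the updated README stops short of actually loading the weights. Below is a minimal sketch of how the `.pt` checkpoints in this commit might be loaded; it assumes the checkpoint keys (`model_state_dict`, `model_config`) hinted at in the previous README revision and a `MoESummarizationModel` class from the training code, which is not bundled with this repository.

```python
import torch

# Hypothetical import: the MoE implementation ships with the training code,
# not with this repository.
# from moe_summarization import MoESummarizationModel

# Checkpoint layout (keys 'model_state_dict' and 'model_config') follows the
# snippet in the previous README revision; this is an assumption, not a
# documented contract. On PyTorch >= 2.6 you may also need weights_only=False
# if the checkpoint stores plain Python objects alongside the tensors.
checkpoint = torch.load("checkpoint_epoch_1.pt", map_location="cpu")
model_config = checkpoint["model_config"]
model_state = checkpoint["model_state_dict"]

print(model_config)  # should echo num_experts, top_k, d_model, etc. from config.json

# model = MoESummarizationModel(**model_config)  # assumed constructor signature
# model.load_state_dict(model_state)
# model.eval()
```

The same pattern presumably applies to `latest_checkpoint.pt`, assuming it uses the same key layout.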
checkpoint_epoch_1.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:12c0245d7bc1ce82267cabbff6a034bf249c9854c9483bbc981a67ebdfcea8ab
+size 547645498
config.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "model_type": "moe_summarization",
+  "architectures": [
+    "MoESummarizationModel"
+  ],
+  "vocab_size": 50265,
+  "d_model": 256,
+  "num_experts": 4,
+  "top_k": 2,
+  "routing_type": "topk",
+  "trial_run": true,
+  "training_samples": 10
+}
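config.json pins down the MoE hyperparameters (`d_model: 256`, `num_experts: 4`, `top_k: 2`, `routing_type: "topk"`). For readers unfamiliar with top-k routing, here is a small, self-contained sketch of a top-2-of-4 expert layer using those dimensions; the class and its internals are illustrative only, not the repository's actual `MoESummarizationModel`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoE(nn.Module):
    """Illustrative top-k mixture-of-experts layer (not the repo's implementation)."""

    def __init__(self, d_model=256, num_experts=4, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(d_model, num_experts)  # one routing logit per expert
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                          nn.Linear(4 * d_model, d_model))
            for _ in range(num_experts)
        )

    def forward(self, x):                        # x: (batch, seq, d_model)
        logits = self.router(x)                  # (batch, seq, num_experts)
        weights, idx = logits.topk(self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)     # renormalise over the chosen experts
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = (idx[..., k] == e).unsqueeze(-1)        # tokens routed to expert e in slot k
                out = out + mask * weights[..., k:k+1] * expert(x)
        return out

# d_model=256, num_experts=4, top_k=2 mirror the values in config.json
layer = TopKMoE()
print(layer(torch.randn(2, 8, 256)).shape)       # torch.Size([2, 8, 256])
```

For clarity the sketch runs every expert on every token and masks afterwards; real MoE implementations dispatch only the routed tokens to each expert.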
latest_checkpoint.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:716c086f434371987cdb6b0f45bdfd005151816727c9b6228441d6729fe94b04
+size 547645114
training_history.json
CHANGED
@@ -1,8 +1,8 @@
 [
   {
     "epoch": 1,
-    "train_loss": 10.
+    "train_loss": 10.604265594482422,
     "train_aux_loss": 0.0,
-    "timestamp": "2025-11-14T15:
+    "timestamp": "2025-11-14T15:34:39.752216"
   }
 ]
trial_results.json
CHANGED
@@ -7,18 +7,18 @@
     "top_k": 2,
     "routing_type": "topk",
     "epochs": 1,
-    "final_loss": 10.
+    "final_loss": 10.625629997253418,
     "training_samples": 10
   },
   "training_history": [
     {
       "epoch": 1,
-      "train_loss": 10.
+      "train_loss": 10.625629997253418,
       "train_aux_loss": 0.0,
-      "timestamp": "2025-11-14T15:
+      "timestamp": "2025-11-14T15:11:49.179446"
     }
   ],
   "repo_url": null,
   "checkpoint_dir": "trial_checkpoints",
-  "timestamp": "2025-11-14T15:
+  "timestamp": "2025-11-14T15:11:52.756849"
 }
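`trial_results.json` and `training_history.json` share the per-epoch record format visible in the diffs above. A short sketch of reading them back, with key names taken from the diffs and the exact nesting of the config block treated as an assumption:

```python
import json

# Paths assume the files sit at the repository root.
with open("training_history.json") as f:
    history = json.load(f)
for record in history:
    print(record["epoch"], record["train_loss"], record["train_aux_loss"], record["timestamp"])

with open("trial_results.json") as f:
    results = json.load(f)
# 'training_history', 'repo_url', 'checkpoint_dir' and 'timestamp' appear to be
# top-level keys in the diff above; treat that as an assumption.
print(results["checkpoint_dir"], results["timestamp"])
```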