matbee committed
Commit 07823f7 · verified · 1 Parent(s): 0a290eb

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +10 -0
  2. README.md +66 -53
  3. fp16/dacvae_decoder.onnx +3 -0
  4. fp16/dacvae_decoder.onnx.data +3 -0
  5. fp16/dacvae_encoder.onnx +3 -0
  6. fp16/dacvae_encoder.onnx.data +3 -0
  7. fp16/dit_single_step.onnx +3 -0
  8. fp16/dit_single_step.onnx.data +3 -0
  9. fp16/t5_encoder.onnx +3 -0
  10. fp16/t5_encoder.onnx.data +3 -0
  11. fp16/tokenizer/special_tokens_map.json +107 -0
  12. fp16/tokenizer/spiece.model +3 -0
  13. fp16/tokenizer/tokenizer.json +0 -0
  14. fp16/tokenizer/tokenizer_config.json +939 -0
  15. fp16/tokenizer_config.json +7 -0
  16. fp16/vision_encoder.onnx +3 -0
  17. fp16/vision_encoder.onnx.data +3 -0
  18. fp32/dacvae_decoder.onnx +3 -0
  19. fp32/dacvae_decoder.onnx.data +3 -0
  20. fp32/dacvae_encoder.onnx +3 -0
  21. fp32/dacvae_encoder.onnx.data +3 -0
  22. fp32/dit_single_step.onnx +3 -0
  23. fp32/dit_single_step.onnx.data +3 -0
  24. fp32/t5_encoder.onnx +3 -0
  25. fp32/t5_encoder.onnx.data +3 -0
  26. fp32/tokenizer/special_tokens_map.json +107 -0
  27. fp32/tokenizer/spiece.model +3 -0
  28. fp32/tokenizer/tokenizer.json +0 -0
  29. fp32/tokenizer/tokenizer_config.json +939 -0
  30. fp32/tokenizer_config.json +7 -0
  31. fp32/vision_encoder.onnx +3 -0
  32. fp32/vision_encoder.onnx.data +3 -0
  33. onnx_export/__init__.py +1 -0
  34. onnx_export/__pycache__/__init__.cpython-312.pyc +0 -0
  35. onnx_export/__pycache__/export_dacvae.cpython-312.pyc +0 -0
  36. onnx_export/__pycache__/export_dit.cpython-312.pyc +0 -0
  37. onnx_export/__pycache__/export_peaframe.cpython-312.pyc +0 -0
  38. onnx_export/__pycache__/export_t5.cpython-312.pyc +0 -0
  39. onnx_export/__pycache__/export_vision.cpython-312.pyc +0 -0
  40. onnx_export/__pycache__/quantize_large_model.cpython-312.pyc +0 -0
  41. onnx_export/__pycache__/quantize_models.cpython-312.pyc +0 -0
  42. onnx_export/__pycache__/standalone_config.cpython-312.pyc +0 -0
  43. onnx_export/export_all.py +130 -0
  44. onnx_export/export_dacvae.py +427 -0
  45. onnx_export/export_dit.py +574 -0
  46. onnx_export/export_peaframe.py +288 -0
  47. onnx_export/export_t5.py +315 -0
  48. onnx_export/export_vision.py +113 -0
  49. onnx_export/quantize_large_model.py +115 -0
  50. onnx_export/quantize_models.py +286 -0
.gitattributes CHANGED
@@ -34,3 +34,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
37
+ fp16/dacvae_decoder.onnx.data filter=lfs diff=lfs merge=lfs -text
38
+ fp16/dacvae_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
39
+ fp16/dit_single_step.onnx.data filter=lfs diff=lfs merge=lfs -text
40
+ fp16/t5_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
41
+ fp16/vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
42
+ fp32/dacvae_decoder.onnx.data filter=lfs diff=lfs merge=lfs -text
43
+ fp32/dacvae_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
44
+ fp32/dit_single_step.onnx.data filter=lfs diff=lfs merge=lfs -text
45
+ fp32/t5_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
46
+ fp32/vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,95 +1,108 @@
1
  # SAM-Audio ONNX (Large)
2
 
3
  ONNX-converted models for [SAM-Audio](https://github.com/facebookresearch/sam-audio) (facebook/sam-audio-large) - Meta's Semantic Audio Modeling for audio source separation.
4
 
5
- ## Model Files
6
 
7
- | File | Description | Size |
8
- |------|-------------|------|
9
- | `dacvae_encoder.onnx` | Audio encoder (48kHz latent) | ~110 MB |
10
- | `dacvae_decoder.onnx` | Audio decoder (latent → 48kHz) | ~320 MB |
11
- | `t5_encoder.onnx` | Text encoder (T5-base) | ~440 MB |
12
- | `dit_single_step.onnx` | DiT denoiser (single ODE step) | ~2 GB |
13
- | `vision_encoder.onnx` | Vision encoder (CLIP-based) | ~1.2 GB |
14
- | `tokenizer/` | SentencePiece tokenizer files | - |
 
 
15
 
16
  ## Installation
17
 
18
  ```bash
19
  pip install onnxruntime sentencepiece torchaudio torchvision torchcodec soundfile
20
- # For CUDA support:
21
  pip install onnxruntime-gpu
22
  ```
23
 
24
- ## Quick Start
25
-
26
- ```python
27
- import numpy as np
28
- import onnxruntime as ort
29
- from huggingface_hub import hf_hub_download
30
-
31
- # Download models
32
- model_dir = "sam-audio-large-onnx"
33
- for f in ["dacvae_encoder.onnx", "dacvae_decoder.onnx", "t5_encoder.onnx",
34
- "dit_single_step.onnx", "vision_encoder.onnx"]:
35
- hf_hub_download("matbee/sam-audio-large-onnx", f, local_dir=model_dir)
36
- if f != "vision_encoder.onnx": # vision encoder embeds weights
37
- hf_hub_download("matbee/sam-audio-large-onnx", f + ".data", local_dir=model_dir)
38
- ```
39
-
40
- ## Usage Examples
41
 
42
- ### Audio-Only Separation
43
  ```bash
44
  python onnx_inference.py \
45
- --audio input.wav \
46
  --text "a person speaking" \
47
- --output separated.wav
 
 
48
  ```
49
 
50
- ### Video-Guided Separation
51
  ```bash
52
  python onnx_inference.py \
53
  --video input.mp4 \
54
- --text "the sound of typing" \
55
- --output separated.wav
 
56
  ```
57
 
58
- ### Visual Prompting with SAM3 Mask
59
  ```bash
60
- # First generate a mask with SAM3 (see generate_sam3_mask.py)
61
  python onnx_inference.py \
62
- --video input.mp4 \
63
- --mask object_mask.mp4 \
64
- --text "" \
65
- --output isolated.wav \
66
- --output-video visualization.mp4
67
  ```
68
 
69
- ## Model Details
70
 
71
  - **Audio Sample Rate**: 48kHz
72
  - **Audio Hop Length**: 1536 samples
73
  - **Vision Input Size**: 336×336 pixels
74
  - **Text Encoder**: T5-base (768-dim)
75
  - **Vision Encoder**: PE-Core-L14-336 (1024-dim)
76
- - **ODE Solver**: Midpoint method (configurable steps)
 
77
 
78
- ## License
79
 
80
- SAM-Audio is released under the [CC-BY-NC 4.0 license](https://creativecommons.org/licenses/by-nc/4.0/).
81
-
82
- ## Citation
83
 
84
- ```bibtex
85
- @article{samaudio2024,
86
- title={SAM-Audio: Semantic Audio Modeling},
87
- author={Meta AI},
88
- year={2024}
89
- }
90
  ```
91
 
 
 
 
 
92
  ## Acknowledgments
93
 
94
  Original model by [Meta AI Research](https://github.com/facebookresearch/sam-audio).
95
- ONNX conversion by [@matbee](https://huggingface.co/matbee).
 
1
+ ---
2
+ license: other
3
+ base_model: facebook/sam-audio-large
4
+ tags:
5
+ - onnx
6
+ - audio
7
+ - sam-audio
8
+ - source-separation
9
+ - audio-visual
10
+ ---
11
+
12
  # SAM-Audio ONNX (Large)
13
 
14
  ONNX-converted models for [SAM-Audio](https://github.com/facebookresearch/sam-audio) (facebook/sam-audio-large) - Meta's Semantic Audio Modeling for audio source separation.
15
 
16
+ This repository contains both **FP32** and **FP16** versions of the models.
17
+
18
+ ## Model Variants
19
+
20
+ | Variant | DiT Size | Total Size | Notes |
21
+ |---------|----------|------------|-------|
22
+ | `fp32/` | 11.76 GB | ~13.9 GB | Full precision |
23
+ | `fp16/` | 5.88 GB | ~8.0 GB | Half precision (recommended) |
24
 
25
+ ## Model Files (per variant)
26
+
27
+ | File | Description | FP32 Size | FP16 Size |
28
+ |------|-------------|-----------|-----------|
29
+ | `dacvae_encoder.onnx` | Audio encoder (48kHz → latent) | 110 MB | 110 MB |
30
+ | `dacvae_decoder.onnx` | Audio decoder (latent → 48kHz) | 320 MB | 320 MB |
31
+ | `t5_encoder.onnx` | Text encoder (T5-base) | 440 MB | 440 MB |
32
+ | `dit_single_step.onnx` | DiT denoiser (3B params) | 11.76 GB | 5.88 GB |
33
+ | `vision_encoder.onnx` | Vision encoder (CLIP-based) | 1.27 GB | 1.27 GB |
34
+ | `tokenizer/` | SentencePiece tokenizer files | - | - |
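
For reference, here is a minimal download sketch with `huggingface_hub` (the repo id `matbee/sam-audio-large-onnx` is taken from the Quick Start that this revision removes; `allow_patterns` is used to pull a single precision variant):

```python
from huggingface_hub import snapshot_download

# Fetch only the fp16/ variant (~8 GB) instead of the whole repository.
# The pattern also covers fp16/tokenizer/*.
model_dir = snapshot_download(
    repo_id="matbee/sam-audio-large-onnx",
    allow_patterns=["fp16/*"],
    local_dir="sam-audio-large-onnx",
)
print(model_dir)
```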
35
 
36
  ## Installation
37
 
38
  ```bash
39
  pip install onnxruntime sentencepiece torchaudio torchvision torchcodec soundfile
40
+ # For CUDA support (recommended for the large model):
41
  pip install onnxruntime-gpu
42
  ```
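
A minimal sketch of opening one of the exported graphs with ONNX Runtime; provider availability depends on which `onnxruntime` package is installed, and the CPU provider acts as the fallback:

```python
import onnxruntime as ort

# Prefer CUDA when onnxruntime-gpu is installed, otherwise fall back to CPU.
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
session = ort.InferenceSession("fp16/t5_encoder.onnx", providers=providers)

# The external weights (t5_encoder.onnx.data) are loaded automatically as long
# as the .data file sits next to its .onnx file.
print([inp.name for inp in session.get_inputs()])
```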
43
 
44
+ ## Usage
45
 
46
+ ### Using FP16 Models (Recommended)
47
  ```bash
48
  python onnx_inference.py \
49
+ --video input.mp4 \
50
  --text "a person speaking" \
51
+ --model-dir fp16 \
52
+ --output target.wav \
53
+ --output-residual residual.wav
54
  ```
55
 
56
+ ### Using FP32 Models
57
  ```bash
58
  python onnx_inference.py \
59
  --video input.mp4 \
60
+ --text "keyboard typing" \
61
+ --model-dir fp32 \
62
+ --output target.wav
63
  ```
64
 
65
+ ### Audio-Only Mode
66
  ```bash
 
67
  python onnx_inference.py \
68
+ --audio input.wav \
69
+ --text "drums" \
70
+ --model-dir fp16 \
71
+ --output drums.wav
 
72
  ```
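
The text branch can also be exercised on its own. The sketch below runs `t5_encoder.onnx` with the bundled `tokenizer/` files; the 77-token padding follows `tokenizer_config.json`, while the tensor names are assumptions, so confirm them against `session.get_inputs()`:

```python
import numpy as np
import onnxruntime as ort
from transformers import T5Tokenizer

# SentencePiece tokenizer shipped with the repo (T5Tokenizer per its config).
tokenizer = T5Tokenizer.from_pretrained("fp16/tokenizer")
enc = tokenizer("a person speaking", return_tensors="np",
                padding="max_length", truncation=True, max_length=77)

session = ort.InferenceSession("fp16/t5_encoder.onnx",
                               providers=["CPUExecutionProvider"])
# Input names are assumptions; inspect session.get_inputs() to confirm.
feeds = {
    "input_ids": enc["input_ids"].astype(np.int64),
    "attention_mask": enc["attention_mask"].astype(np.int64),
}
text_emb = session.run(None, feeds)[0]  # expected shape (1, 77, 768) for T5-base
print(text_emb.shape)
```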
73
 
74
+ ## Model Specifications
75
 
76
  - **Audio Sample Rate**: 48kHz
77
  - **Audio Hop Length**: 1536 samples
78
  - **Vision Input Size**: 336×336 pixels
79
  - **Text Encoder**: T5-base (768-dim)
80
  - **Vision Encoder**: PE-Core-L14-336 (1024-dim)
81
+ - **DiT Parameters**: ~3 billion
82
+ - **ODE Solver**: Midpoint method (default 16 steps)
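
To make the single-step DiT concrete: `dit_single_step.onnx` evaluates the velocity field once, and the midpoint solver calls it twice per step. An illustrative loop, assuming integration over a unit time interval; the `dit(x, t, cond)` callable and the conditioning details are placeholders, not the actual interface of `onnx_inference.py`:

```python
def midpoint_solve(dit, x, cond, num_steps=16):
    """Explicit midpoint integration of x from t=0 to t=1.

    `dit` stands in for a wrapper around dit_single_step.onnx that returns
    the predicted velocity for (x, t, cond).
    """
    dt = 1.0 / num_steps
    t = 0.0
    for _ in range(num_steps):
        k1 = dit(x, t, cond)                             # velocity at step start
        k2 = dit(x + 0.5 * dt * k1, t + 0.5 * dt, cond)  # velocity at midpoint
        x = x + dt * k2
        t += dt
    return x
```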
83
 
84
+ ## Exporting Models
85
 
86
+ ### Export FP16 DiT (Recommended)
87
+ ```bash
88
+ python -m onnx_export.export_dit \
89
+ --output-dir ./my_models \
90
+ --model-id facebook/sam-audio-large \
91
+ --fp16 \
92
+ --device cuda
93
+ ```
94
 
95
+ ### Export Other Components
96
+ ```bash
97
+ python -m onnx_export.export_dacvae --output-dir ./my_models --model-id facebook/sam-audio-large
98
+ python -m onnx_export.export_t5 --output-dir ./my_models --model-id facebook/sam-audio-large
99
+ python -m onnx_export.export_vision --model facebook/sam-audio-large --output ./my_models
 
100
  ```
101
 
102
+ ## License
103
+
104
+ SAM-Audio is released under the [CC-BY-NC 4.0 license](https://creativecommons.org/licenses/by-nc/4.0/). See [original repository](https://huggingface.co/facebook/sam-audio-large) for full terms.
105
+
106
  ## Acknowledgments
107
 
108
  Original model by [Meta AI Research](https://github.com/facebookresearch/sam-audio).
 
fp16/dacvae_decoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be13c0842945293ca0f5d77514783dc126007c5b490f00444c1c29f44f7035df
3
+ size 1103715
fp16/dacvae_decoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1dba74364b1d721a4d7f921b18e58577c4f7375c3e2f82e46bf4898ecd61cba
3
+ size 320536576
fp16/dacvae_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a859c46e43a78dec7a974dcaaf433890e78bb2ac3b4dc5b498da414b86580b
3
+ size 866783
fp16/dacvae_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68db13ee862cd7b5869176e5f080b2f88bc1a97711babd59d55ed93b26b047f6
3
+ size 110231552
fp16/dit_single_step.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85f1260ac7773fd3efa231ace4ebdf063f57c4a52719b3db09f1871af5a5f59b
3
+ size 5698535
fp16/dit_single_step.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d21fde5750792dd4f9d10f85ccfa397363ec9c3bd72b3c711bcbe5a6e8f48b6
3
+ size 5878317056
fp16/t5_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1776c8bce2eb4dbcd297bde499ea780253bc09677cfa8e18baef383859437e1c
3
+ size 1110394
fp16/t5_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a91e63b28acfc81e85f659309625b54b5bf0c2e88161025d3a3b8580d4e20c8
3
+ size 438566912
fp16/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
fp16/tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
fp16/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
fp16/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,939 @@
1
+ {
2
+ "add_prefix_space": null,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<unk>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "32000": {
29
+ "content": "<extra_id_99>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "32001": {
37
+ "content": "<extra_id_98>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32002": {
45
+ "content": "<extra_id_97>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32003": {
53
+ "content": "<extra_id_96>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32004": {
61
+ "content": "<extra_id_95>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32005": {
69
+ "content": "<extra_id_94>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32006": {
77
+ "content": "<extra_id_93>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32007": {
85
+ "content": "<extra_id_92>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "32008": {
93
+ "content": "<extra_id_91>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "32009": {
101
+ "content": "<extra_id_90>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "32010": {
109
+ "content": "<extra_id_89>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "32011": {
117
+ "content": "<extra_id_88>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "32012": {
125
+ "content": "<extra_id_87>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "32013": {
133
+ "content": "<extra_id_86>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "32014": {
141
+ "content": "<extra_id_85>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "32015": {
149
+ "content": "<extra_id_84>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "32016": {
157
+ "content": "<extra_id_83>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "32017": {
165
+ "content": "<extra_id_82>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "32018": {
173
+ "content": "<extra_id_81>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "32019": {
181
+ "content": "<extra_id_80>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "32020": {
189
+ "content": "<extra_id_79>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "32021": {
197
+ "content": "<extra_id_78>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "32022": {
205
+ "content": "<extra_id_77>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "32023": {
213
+ "content": "<extra_id_76>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "32024": {
221
+ "content": "<extra_id_75>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "32025": {
229
+ "content": "<extra_id_74>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "32026": {
237
+ "content": "<extra_id_73>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "32027": {
245
+ "content": "<extra_id_72>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "32028": {
253
+ "content": "<extra_id_71>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32029": {
261
+ "content": "<extra_id_70>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "32030": {
269
+ "content": "<extra_id_69>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "32031": {
277
+ "content": "<extra_id_68>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "32032": {
285
+ "content": "<extra_id_67>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "32033": {
293
+ "content": "<extra_id_66>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "32034": {
301
+ "content": "<extra_id_65>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "32035": {
309
+ "content": "<extra_id_64>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "32036": {
317
+ "content": "<extra_id_63>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "32037": {
325
+ "content": "<extra_id_62>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "32038": {
333
+ "content": "<extra_id_61>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "32039": {
341
+ "content": "<extra_id_60>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "32040": {
349
+ "content": "<extra_id_59>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "32041": {
357
+ "content": "<extra_id_58>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "32042": {
365
+ "content": "<extra_id_57>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "32043": {
373
+ "content": "<extra_id_56>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "32044": {
381
+ "content": "<extra_id_55>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "32045": {
389
+ "content": "<extra_id_54>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "32046": {
397
+ "content": "<extra_id_53>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "32047": {
405
+ "content": "<extra_id_52>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "32048": {
413
+ "content": "<extra_id_51>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "32049": {
421
+ "content": "<extra_id_50>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "32050": {
429
+ "content": "<extra_id_49>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "32051": {
437
+ "content": "<extra_id_48>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "32052": {
445
+ "content": "<extra_id_47>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "32053": {
453
+ "content": "<extra_id_46>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "32054": {
461
+ "content": "<extra_id_45>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "32055": {
469
+ "content": "<extra_id_44>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "32056": {
477
+ "content": "<extra_id_43>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "32057": {
485
+ "content": "<extra_id_42>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "32058": {
493
+ "content": "<extra_id_41>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "32059": {
501
+ "content": "<extra_id_40>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "32060": {
509
+ "content": "<extra_id_39>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "32061": {
517
+ "content": "<extra_id_38>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "32062": {
525
+ "content": "<extra_id_37>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "32063": {
533
+ "content": "<extra_id_36>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "32064": {
541
+ "content": "<extra_id_35>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "32065": {
549
+ "content": "<extra_id_34>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "32066": {
557
+ "content": "<extra_id_33>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "32067": {
565
+ "content": "<extra_id_32>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "32068": {
573
+ "content": "<extra_id_31>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "32069": {
581
+ "content": "<extra_id_30>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "32070": {
589
+ "content": "<extra_id_29>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "32071": {
597
+ "content": "<extra_id_28>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "32072": {
605
+ "content": "<extra_id_27>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "32073": {
613
+ "content": "<extra_id_26>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "32074": {
621
+ "content": "<extra_id_25>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "32075": {
629
+ "content": "<extra_id_24>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "32076": {
637
+ "content": "<extra_id_23>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "32077": {
645
+ "content": "<extra_id_22>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "32078": {
653
+ "content": "<extra_id_21>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "32079": {
661
+ "content": "<extra_id_20>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "32080": {
669
+ "content": "<extra_id_19>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "32081": {
677
+ "content": "<extra_id_18>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "32082": {
685
+ "content": "<extra_id_17>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "32083": {
693
+ "content": "<extra_id_16>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "32084": {
701
+ "content": "<extra_id_15>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "32085": {
709
+ "content": "<extra_id_14>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "32086": {
717
+ "content": "<extra_id_13>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "32087": {
725
+ "content": "<extra_id_12>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "32088": {
733
+ "content": "<extra_id_11>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "32089": {
741
+ "content": "<extra_id_10>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "32090": {
749
+ "content": "<extra_id_9>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "32091": {
757
+ "content": "<extra_id_8>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "32092": {
765
+ "content": "<extra_id_7>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "32093": {
773
+ "content": "<extra_id_6>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "32094": {
781
+ "content": "<extra_id_5>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "32095": {
789
+ "content": "<extra_id_4>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "32096": {
797
+ "content": "<extra_id_3>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "32097": {
805
+ "content": "<extra_id_2>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "32098": {
813
+ "content": "<extra_id_1>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "32099": {
821
+ "content": "<extra_id_0>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ }
828
+ },
829
+ "additional_special_tokens": [
830
+ "<extra_id_0>",
831
+ "<extra_id_1>",
832
+ "<extra_id_2>",
833
+ "<extra_id_3>",
834
+ "<extra_id_4>",
835
+ "<extra_id_5>",
836
+ "<extra_id_6>",
837
+ "<extra_id_7>",
838
+ "<extra_id_8>",
839
+ "<extra_id_9>",
840
+ "<extra_id_10>",
841
+ "<extra_id_11>",
842
+ "<extra_id_12>",
843
+ "<extra_id_13>",
844
+ "<extra_id_14>",
845
+ "<extra_id_15>",
846
+ "<extra_id_16>",
847
+ "<extra_id_17>",
848
+ "<extra_id_18>",
849
+ "<extra_id_19>",
850
+ "<extra_id_20>",
851
+ "<extra_id_21>",
852
+ "<extra_id_22>",
853
+ "<extra_id_23>",
854
+ "<extra_id_24>",
855
+ "<extra_id_25>",
856
+ "<extra_id_26>",
857
+ "<extra_id_27>",
858
+ "<extra_id_28>",
859
+ "<extra_id_29>",
860
+ "<extra_id_30>",
861
+ "<extra_id_31>",
862
+ "<extra_id_32>",
863
+ "<extra_id_33>",
864
+ "<extra_id_34>",
865
+ "<extra_id_35>",
866
+ "<extra_id_36>",
867
+ "<extra_id_37>",
868
+ "<extra_id_38>",
869
+ "<extra_id_39>",
870
+ "<extra_id_40>",
871
+ "<extra_id_41>",
872
+ "<extra_id_42>",
873
+ "<extra_id_43>",
874
+ "<extra_id_44>",
875
+ "<extra_id_45>",
876
+ "<extra_id_46>",
877
+ "<extra_id_47>",
878
+ "<extra_id_48>",
879
+ "<extra_id_49>",
880
+ "<extra_id_50>",
881
+ "<extra_id_51>",
882
+ "<extra_id_52>",
883
+ "<extra_id_53>",
884
+ "<extra_id_54>",
885
+ "<extra_id_55>",
886
+ "<extra_id_56>",
887
+ "<extra_id_57>",
888
+ "<extra_id_58>",
889
+ "<extra_id_59>",
890
+ "<extra_id_60>",
891
+ "<extra_id_61>",
892
+ "<extra_id_62>",
893
+ "<extra_id_63>",
894
+ "<extra_id_64>",
895
+ "<extra_id_65>",
896
+ "<extra_id_66>",
897
+ "<extra_id_67>",
898
+ "<extra_id_68>",
899
+ "<extra_id_69>",
900
+ "<extra_id_70>",
901
+ "<extra_id_71>",
902
+ "<extra_id_72>",
903
+ "<extra_id_73>",
904
+ "<extra_id_74>",
905
+ "<extra_id_75>",
906
+ "<extra_id_76>",
907
+ "<extra_id_77>",
908
+ "<extra_id_78>",
909
+ "<extra_id_79>",
910
+ "<extra_id_80>",
911
+ "<extra_id_81>",
912
+ "<extra_id_82>",
913
+ "<extra_id_83>",
914
+ "<extra_id_84>",
915
+ "<extra_id_85>",
916
+ "<extra_id_86>",
917
+ "<extra_id_87>",
918
+ "<extra_id_88>",
919
+ "<extra_id_89>",
920
+ "<extra_id_90>",
921
+ "<extra_id_91>",
922
+ "<extra_id_92>",
923
+ "<extra_id_93>",
924
+ "<extra_id_94>",
925
+ "<extra_id_95>",
926
+ "<extra_id_96>",
927
+ "<extra_id_97>",
928
+ "<extra_id_98>",
929
+ "<extra_id_99>"
930
+ ],
931
+ "clean_up_tokenization_spaces": false,
932
+ "eos_token": "</s>",
933
+ "extra_ids": 100,
934
+ "extra_special_tokens": {},
935
+ "model_max_length": 1000000000000000019884624838656,
936
+ "pad_token": "<pad>",
937
+ "tokenizer_class": "T5Tokenizer",
938
+ "unk_token": "<unk>"
939
+ }
fp16/tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "model_name": "google-t5/t5-base",
3
+ "max_length": 77,
4
+ "vocab_size": 32100,
5
+ "pad_token_id": 0,
6
+ "eos_token_id": 1
7
+ }
fp16/vision_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8535c5bc55fbcc62fba78774e34c4ba59dc721275194efda73b26aee2eead9
3
+ size 3098779
fp16/vision_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b15b05f71b646b454bf77037c82cd1335917f39b2f847baaf5cb4d20880ee9
3
+ size 1268842496
fp32/dacvae_decoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be13c0842945293ca0f5d77514783dc126007c5b490f00444c1c29f44f7035df
3
+ size 1103715
fp32/dacvae_decoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1dba74364b1d721a4d7f921b18e58577c4f7375c3e2f82e46bf4898ecd61cba
3
+ size 320536576
fp32/dacvae_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a859c46e43a78dec7a974dcaaf433890e78bb2ac3b4dc5b498da414b86580b
3
+ size 866783
fp32/dacvae_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68db13ee862cd7b5869176e5f080b2f88bc1a97711babd59d55ed93b26b047f6
3
+ size 110231552
fp32/dit_single_step.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03568412927cffbef44e99fc666b7fc0348807ad7a4852fe8ec8c8f272846f83
3
+ size 5115331
fp32/dit_single_step.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc5380da46b3962029b0d1c604f7ceff527ebe50e13190b7f674b91f29cf0072
3
+ size 11755978752
fp32/t5_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1776c8bce2eb4dbcd297bde499ea780253bc09677cfa8e18baef383859437e1c
3
+ size 1110394
fp32/t5_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a91e63b28acfc81e85f659309625b54b5bf0c2e88161025d3a3b8580d4e20c8
3
+ size 438566912
fp32/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
fp32/tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
fp32/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
fp32/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,939 @@
1
+ {
2
+ "add_prefix_space": null,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<unk>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "32000": {
29
+ "content": "<extra_id_99>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "32001": {
37
+ "content": "<extra_id_98>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32002": {
45
+ "content": "<extra_id_97>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32003": {
53
+ "content": "<extra_id_96>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32004": {
61
+ "content": "<extra_id_95>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32005": {
69
+ "content": "<extra_id_94>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32006": {
77
+ "content": "<extra_id_93>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32007": {
85
+ "content": "<extra_id_92>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "32008": {
93
+ "content": "<extra_id_91>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "32009": {
101
+ "content": "<extra_id_90>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "32010": {
109
+ "content": "<extra_id_89>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "32011": {
117
+ "content": "<extra_id_88>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "32012": {
125
+ "content": "<extra_id_87>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "32013": {
133
+ "content": "<extra_id_86>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "32014": {
141
+ "content": "<extra_id_85>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "32015": {
149
+ "content": "<extra_id_84>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "32016": {
157
+ "content": "<extra_id_83>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "32017": {
165
+ "content": "<extra_id_82>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "32018": {
173
+ "content": "<extra_id_81>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "32019": {
181
+ "content": "<extra_id_80>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "32020": {
189
+ "content": "<extra_id_79>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "32021": {
197
+ "content": "<extra_id_78>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "32022": {
205
+ "content": "<extra_id_77>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "32023": {
213
+ "content": "<extra_id_76>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "32024": {
221
+ "content": "<extra_id_75>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "32025": {
229
+ "content": "<extra_id_74>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "32026": {
237
+ "content": "<extra_id_73>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "32027": {
245
+ "content": "<extra_id_72>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "32028": {
253
+ "content": "<extra_id_71>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32029": {
261
+ "content": "<extra_id_70>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "32030": {
269
+ "content": "<extra_id_69>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "32031": {
277
+ "content": "<extra_id_68>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "32032": {
285
+ "content": "<extra_id_67>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "32033": {
293
+ "content": "<extra_id_66>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "32034": {
301
+ "content": "<extra_id_65>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "32035": {
309
+ "content": "<extra_id_64>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "32036": {
317
+ "content": "<extra_id_63>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "32037": {
325
+ "content": "<extra_id_62>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "32038": {
333
+ "content": "<extra_id_61>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "32039": {
341
+ "content": "<extra_id_60>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "32040": {
349
+ "content": "<extra_id_59>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "32041": {
357
+ "content": "<extra_id_58>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "32042": {
365
+ "content": "<extra_id_57>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "32043": {
373
+ "content": "<extra_id_56>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "32044": {
381
+ "content": "<extra_id_55>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "32045": {
389
+ "content": "<extra_id_54>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "32046": {
397
+ "content": "<extra_id_53>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "32047": {
405
+ "content": "<extra_id_52>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "32048": {
413
+ "content": "<extra_id_51>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "32049": {
421
+ "content": "<extra_id_50>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "32050": {
429
+ "content": "<extra_id_49>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "32051": {
437
+ "content": "<extra_id_48>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "32052": {
445
+ "content": "<extra_id_47>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "32053": {
453
+ "content": "<extra_id_46>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "32054": {
461
+ "content": "<extra_id_45>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "32055": {
469
+ "content": "<extra_id_44>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "32056": {
477
+ "content": "<extra_id_43>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "32057": {
485
+ "content": "<extra_id_42>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "32058": {
493
+ "content": "<extra_id_41>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "32059": {
501
+ "content": "<extra_id_40>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "32060": {
509
+ "content": "<extra_id_39>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "32061": {
517
+ "content": "<extra_id_38>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "32062": {
525
+ "content": "<extra_id_37>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "32063": {
533
+ "content": "<extra_id_36>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "32064": {
541
+ "content": "<extra_id_35>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "32065": {
549
+ "content": "<extra_id_34>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "32066": {
557
+ "content": "<extra_id_33>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "32067": {
565
+ "content": "<extra_id_32>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "32068": {
573
+ "content": "<extra_id_31>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "32069": {
581
+ "content": "<extra_id_30>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "32070": {
589
+ "content": "<extra_id_29>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "32071": {
597
+ "content": "<extra_id_28>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "32072": {
605
+ "content": "<extra_id_27>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "32073": {
613
+ "content": "<extra_id_26>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "32074": {
621
+ "content": "<extra_id_25>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "32075": {
629
+ "content": "<extra_id_24>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "32076": {
637
+ "content": "<extra_id_23>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "32077": {
645
+ "content": "<extra_id_22>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "32078": {
653
+ "content": "<extra_id_21>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "32079": {
661
+ "content": "<extra_id_20>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "32080": {
669
+ "content": "<extra_id_19>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "32081": {
677
+ "content": "<extra_id_18>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "32082": {
685
+ "content": "<extra_id_17>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "32083": {
693
+ "content": "<extra_id_16>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "32084": {
701
+ "content": "<extra_id_15>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "32085": {
709
+ "content": "<extra_id_14>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "32086": {
717
+ "content": "<extra_id_13>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "32087": {
725
+ "content": "<extra_id_12>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "32088": {
733
+ "content": "<extra_id_11>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "32089": {
741
+ "content": "<extra_id_10>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "32090": {
749
+ "content": "<extra_id_9>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "32091": {
757
+ "content": "<extra_id_8>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "32092": {
765
+ "content": "<extra_id_7>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "32093": {
773
+ "content": "<extra_id_6>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "32094": {
781
+ "content": "<extra_id_5>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "32095": {
789
+ "content": "<extra_id_4>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "32096": {
797
+ "content": "<extra_id_3>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "32097": {
805
+ "content": "<extra_id_2>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "32098": {
813
+ "content": "<extra_id_1>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "32099": {
821
+ "content": "<extra_id_0>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ }
828
+ },
829
+ "additional_special_tokens": [
830
+ "<extra_id_0>",
831
+ "<extra_id_1>",
832
+ "<extra_id_2>",
833
+ "<extra_id_3>",
834
+ "<extra_id_4>",
835
+ "<extra_id_5>",
836
+ "<extra_id_6>",
837
+ "<extra_id_7>",
838
+ "<extra_id_8>",
839
+ "<extra_id_9>",
840
+ "<extra_id_10>",
841
+ "<extra_id_11>",
842
+ "<extra_id_12>",
843
+ "<extra_id_13>",
844
+ "<extra_id_14>",
845
+ "<extra_id_15>",
846
+ "<extra_id_16>",
847
+ "<extra_id_17>",
848
+ "<extra_id_18>",
849
+ "<extra_id_19>",
850
+ "<extra_id_20>",
851
+ "<extra_id_21>",
852
+ "<extra_id_22>",
853
+ "<extra_id_23>",
854
+ "<extra_id_24>",
855
+ "<extra_id_25>",
856
+ "<extra_id_26>",
857
+ "<extra_id_27>",
858
+ "<extra_id_28>",
859
+ "<extra_id_29>",
860
+ "<extra_id_30>",
861
+ "<extra_id_31>",
862
+ "<extra_id_32>",
863
+ "<extra_id_33>",
864
+ "<extra_id_34>",
865
+ "<extra_id_35>",
866
+ "<extra_id_36>",
867
+ "<extra_id_37>",
868
+ "<extra_id_38>",
869
+ "<extra_id_39>",
870
+ "<extra_id_40>",
871
+ "<extra_id_41>",
872
+ "<extra_id_42>",
873
+ "<extra_id_43>",
874
+ "<extra_id_44>",
875
+ "<extra_id_45>",
876
+ "<extra_id_46>",
877
+ "<extra_id_47>",
878
+ "<extra_id_48>",
879
+ "<extra_id_49>",
880
+ "<extra_id_50>",
881
+ "<extra_id_51>",
882
+ "<extra_id_52>",
883
+ "<extra_id_53>",
884
+ "<extra_id_54>",
885
+ "<extra_id_55>",
886
+ "<extra_id_56>",
887
+ "<extra_id_57>",
888
+ "<extra_id_58>",
889
+ "<extra_id_59>",
890
+ "<extra_id_60>",
891
+ "<extra_id_61>",
892
+ "<extra_id_62>",
893
+ "<extra_id_63>",
894
+ "<extra_id_64>",
895
+ "<extra_id_65>",
896
+ "<extra_id_66>",
897
+ "<extra_id_67>",
898
+ "<extra_id_68>",
899
+ "<extra_id_69>",
900
+ "<extra_id_70>",
901
+ "<extra_id_71>",
902
+ "<extra_id_72>",
903
+ "<extra_id_73>",
904
+ "<extra_id_74>",
905
+ "<extra_id_75>",
906
+ "<extra_id_76>",
907
+ "<extra_id_77>",
908
+ "<extra_id_78>",
909
+ "<extra_id_79>",
910
+ "<extra_id_80>",
911
+ "<extra_id_81>",
912
+ "<extra_id_82>",
913
+ "<extra_id_83>",
914
+ "<extra_id_84>",
915
+ "<extra_id_85>",
916
+ "<extra_id_86>",
917
+ "<extra_id_87>",
918
+ "<extra_id_88>",
919
+ "<extra_id_89>",
920
+ "<extra_id_90>",
921
+ "<extra_id_91>",
922
+ "<extra_id_92>",
923
+ "<extra_id_93>",
924
+ "<extra_id_94>",
925
+ "<extra_id_95>",
926
+ "<extra_id_96>",
927
+ "<extra_id_97>",
928
+ "<extra_id_98>",
929
+ "<extra_id_99>"
930
+ ],
931
+ "clean_up_tokenization_spaces": false,
932
+ "eos_token": "</s>",
933
+ "extra_ids": 100,
934
+ "extra_special_tokens": {},
935
+ "model_max_length": 1000000000000000019884624838656,
936
+ "pad_token": "<pad>",
937
+ "tokenizer_class": "T5Tokenizer",
938
+ "unk_token": "<unk>"
939
+ }
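The tokenizer folder above is a stock T5 sentinel-token setup (T5Tokenizer, 100 <extra_id_*> specials, <pad>/</s>/<unk>). A minimal sketch of loading it and sanity-checking the mapping, assuming transformers is installed and that the local folder path below matches where these files live in the repo (fp32/tokenizer is an assumption):

from transformers import AutoTokenizer

# Assumed local path to the tokenizer folder whose config is shown above.
tok = AutoTokenizer.from_pretrained("fp32/tokenizer")

print(tok.convert_tokens_to_ids("<extra_id_0>"))    # 32099, matching the added-tokens table
print(tok.pad_token, tok.eos_token, tok.unk_token)  # <pad> </s> <unk>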
fp32/tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "model_name": "google-t5/t5-base",
3
+ "max_length": 77,
4
+ "vocab_size": 32100,
5
+ "pad_token_id": 0,
6
+ "eos_token_id": 1
7
+ }
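This small sidecar config records the text settings the exported pipeline assumes: the google-t5/t5-base vocabulary (32100 tokens), 77-token prompts, pad id 0 and eos id 1. A hedged sketch of using it to prepare a prompt for the exported T5 encoder, assuming transformers is installed and the base tokenizer is downloadable or cached; the prompt string is purely illustrative:

import json
from transformers import AutoTokenizer

with open("fp32/tokenizer_config.json") as f:  # the 7-key config added above
    cfg = json.load(f)

tok = AutoTokenizer.from_pretrained(cfg["model_name"])  # google-t5/t5-base
enc = tok(
    "isolate the barking dog",     # illustrative text prompt
    padding="max_length",
    truncation=True,
    max_length=cfg["max_length"],  # 77
    return_tensors="np",
)
# input_ids / attention_mask are the tensors the exported T5 encoder consumes.
assert enc["input_ids"].shape == (1, cfg["max_length"])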
fp32/vision_encoder.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7011e2c8f01fff26d89ba2515005f34a505d5be381ddc910d089b57cb90b605
3
+ size 2822352
fp32/vision_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b15b05f71b646b454bf77037c82cd1335917f39b2f847baaf5cb4d20880ee9
3
+ size 1268842496
onnx_export/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # ONNX Export utilities for SAM Audio
onnx_export/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (148 Bytes). View file
 
onnx_export/__pycache__/export_dacvae.cpython-312.pyc ADDED
Binary file (15.3 kB). View file
 
onnx_export/__pycache__/export_dit.cpython-312.pyc ADDED
Binary file (23.7 kB). View file
 
onnx_export/__pycache__/export_peaframe.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
onnx_export/__pycache__/export_t5.cpython-312.pyc ADDED
Binary file (11.1 kB). View file
 
onnx_export/__pycache__/export_vision.cpython-312.pyc ADDED
Binary file (5.27 kB). View file
 
onnx_export/__pycache__/quantize_large_model.cpython-312.pyc ADDED
Binary file (5.3 kB). View file
 
onnx_export/__pycache__/quantize_models.cpython-312.pyc ADDED
Binary file (11.1 kB). View file
 
onnx_export/__pycache__/standalone_config.cpython-312.pyc ADDED
Binary file (5.93 kB). View file
 
onnx_export/export_all.py ADDED
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export all SAM Audio components to ONNX format.
4
+
5
+ This script exports:
6
+ 1. DACVAE encoder and decoder (audio codec)
7
+ 2. T5 text encoder
8
+ 3. DiT transformer (single-step for ODE solving)
9
+ 4. Vision encoder (CLIP-based, for video-guided separation)
10
+ Usage: python -m onnx_export.export_all --output-dir onnx_models --verify
11
+ """
12
+
13
+ import os
14
+ import argparse
15
+ import subprocess
16
+ import sys
17
+
18
+
19
+ def run_export(module: str, args: list[str]) -> bool:
20
+ """Run an export module with the given arguments."""
21
+ cmd = [sys.executable, "-m", module] + args
22
+ print(f"\n{'='*60}")
23
+ print(f"Running: {' '.join(cmd)}")
24
+ print(f"{'='*60}\n")
25
+
26
+ result = subprocess.run(cmd, cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
27
+ return result.returncode == 0
28
+
29
+
30
+ def main():
31
+ parser = argparse.ArgumentParser(description="Export all SAM Audio components to ONNX")
32
+ parser.add_argument(
33
+ "--output-dir",
34
+ type=str,
35
+ default="onnx_models",
36
+ help="Output directory for ONNX models",
37
+ )
38
+ parser.add_argument(
39
+ "--model",
40
+ type=str,
41
+ default="facebook/sam-audio-small",
42
+ help="SAM-Audio model ID (e.g., facebook/sam-audio-small, facebook/sam-audio-large, facebook/sam-audio-base-tv)",
43
+ )
44
+ parser.add_argument(
45
+ "--verify",
46
+ action="store_true",
47
+ help="Verify ONNX output matches PyTorch",
48
+ )
49
+ parser.add_argument(
50
+ "--skip-dacvae",
51
+ action="store_true",
52
+ help="Skip DACVAE export",
53
+ )
54
+ parser.add_argument(
55
+ "--skip-t5",
56
+ action="store_true",
57
+ help="Skip T5 export",
58
+ )
59
+ parser.add_argument(
60
+ "--skip-dit",
61
+ action="store_true",
62
+ help="Skip DiT export",
63
+ )
64
+ parser.add_argument(
65
+ "--skip-vision",
66
+ action="store_true",
67
+ help="Skip Vision encoder export",
68
+ )
69
+
70
+ args = parser.parse_args()
71
+
72
+ os.makedirs(args.output_dir, exist_ok=True)
73
+
74
+ results = {}
75
+
76
+ # Export DACVAE
77
+ if not args.skip_dacvae:
78
+ export_args = ["--output-dir", args.output_dir, "--model-id", args.model]
79
+ if args.verify:
80
+ export_args.append("--verify")
81
+ results["DACVAE"] = run_export("onnx_export.export_dacvae", export_args)
82
+
83
+ # Export T5 (always uses google-t5/t5-base, independent of SAM-Audio model)
84
+ if not args.skip_t5:
85
+ export_args = ["--output-dir", args.output_dir]
86
+ if args.verify:
87
+ export_args.append("--verify")
88
+ results["T5"] = run_export("onnx_export.export_t5", export_args)
89
+
90
+ # Export DiT
91
+ if not args.skip_dit:
92
+ export_args = ["--output-dir", args.output_dir, "--model-id", args.model]
93
+ if args.verify:
94
+ export_args.append("--verify")
95
+ results["DiT"] = run_export("onnx_export.export_dit", export_args)
96
+
97
+ # Export Vision Encoder
98
+ if not args.skip_vision:
99
+ export_args = ["--output", args.output_dir, "--model", args.model]
100
+ results["Vision"] = run_export("onnx_export.export_vision", export_args)
101
+
102
+ # Print summary
103
+ print(f"\n{'='*60}")
104
+ print("Export Summary")
105
+ print(f"{'='*60}")
106
+
107
+ all_success = True
108
+ for name, success in results.items():
109
+ status = "✓" if success else "✗"
110
+ print(f" {status} {name}")
111
+ if not success:
112
+ all_success = False
113
+
114
+ # List exported files
115
+ print(f"\nExported files in {args.output_dir}:")
116
+ for f in sorted(os.listdir(args.output_dir)):
117
+ path = os.path.join(args.output_dir, f)
118
+ if os.path.isfile(path):
119
+ size_mb = os.path.getsize(path) / (1024 * 1024)
120
+ print(f" {f}: {size_mb:.1f} MB")
121
+
122
+ if all_success:
123
+ print("\n✓ All exports completed successfully!")
124
+ else:
125
+ print("\n✗ Some exports failed")
126
+ sys.exit(1)
127
+
128
+
129
+ if __name__ == "__main__":
130
+ main()
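export_all.py is a thin driver over the per-component export modules. A small sketch of driving it programmatically, mirroring run_export() above; it assumes the repo root as the working directory with the export dependencies installed, and uses only flags defined in main():

import subprocess
import sys

# Export only T5 and the DiT, skipping the audio codec and vision encoder.
cmd = [
    sys.executable, "-m", "onnx_export.export_all",
    "--output-dir", "onnx_models",
    "--model", "facebook/sam-audio-small",
    "--skip-dacvae", "--skip-vision",
    "--verify",
]
subprocess.run(cmd, check=True)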
onnx_export/export_dacvae.py ADDED
@@ -0,0 +1,427 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export DACVAE (audio codec) to ONNX format.
4
+
5
+ This exports the encoder and decoder separately:
6
+ - Encoder: audio waveform → latent features
7
+ - Decoder: latent features → audio waveform
8
+
9
+ Usage:
10
+ python -m onnx_export.export_dacvae --output-dir onnx_models --verify
11
+ """
12
+
13
+ import os
14
+ import argparse
15
+ import torch
16
+ import torch.nn as nn
17
+ import dacvae
18
+ from huggingface_hub import hf_hub_download
19
+
20
+
21
+ # Default DACVAE configuration (matches SAM Audio)
22
+ DEFAULT_CONFIG = {
23
+ "encoder_dim": 64,
24
+ "encoder_rates": [2, 8, 10, 12],
25
+ "latent_dim": 1024,
26
+ "decoder_dim": 1536,
27
+ "decoder_rates": [12, 10, 8, 2],
28
+ "n_codebooks": 16,
29
+ "codebook_size": 1024,
30
+ "codebook_dim": 128,
31
+ "quantizer_dropout": False,
32
+ "sample_rate": 48000,
33
+ }
34
+
35
+
36
+ class DACVAEEncoderWrapper(nn.Module):
37
+ """Wrapper for DACVAE encoder that outputs continuous latent features."""
38
+
39
+ def __init__(self, encoder, quantizer):
40
+ super().__init__()
41
+ self.encoder = encoder
42
+ self.in_proj = quantizer.in_proj
43
+
44
+ def forward(self, audio: torch.Tensor) -> torch.Tensor:
45
+ """
46
+ Encode audio to latent features.
47
+
48
+ Args:
49
+ audio: Input waveform, shape (batch, 1, samples)
50
+
51
+ Returns:
52
+ latent_features: Continuous latent mean, shape (batch, 128, time_steps)
53
+ """
54
+ x = self.encoder(audio)
55
+ # in_proj outputs 256 dim, chunk into mean and variance, use only mean
56
+ mean, _ = self.in_proj(x).chunk(2, dim=1)
57
+ return mean
58
+
59
+
60
+ class DACVAEDecoderWrapper(nn.Module):
61
+ """Wrapper for DACVAE decoder that takes continuous latent features."""
62
+
63
+ def __init__(self, decoder, quantizer):
64
+ super().__init__()
65
+ self.decoder = decoder
66
+ self.out_proj = quantizer.out_proj
67
+
68
+ def forward(self, latent_features: torch.Tensor) -> torch.Tensor:
69
+ """
70
+ Decode latent features to audio.
71
+
72
+ Args:
73
+ latent_features: Continuous latent, shape (batch, 128, time_steps)
74
+
75
+ Returns:
76
+ audio: Output waveform, shape (batch, 1, samples)
77
+ """
78
+ x = self.out_proj(latent_features)
79
+ return self.decoder(x)
80
+
81
+
82
+ def create_dacvae_model(model_id: str = "facebook/sam-audio-small") -> dacvae.DACVAE:
83
+ """
84
+ Create and load DACVAE model with weights from SAM Audio checkpoint.
85
+
86
+ This uses the standalone dacvae library, so we avoid loading the full SAM Audio
87
+ model and its dependencies (vision encoder, imagebind, etc.).
88
+ """
89
+ print(f"Creating DACVAE model...")
90
+
91
+ model = dacvae.DACVAE(
92
+ encoder_dim=DEFAULT_CONFIG["encoder_dim"],
93
+ encoder_rates=DEFAULT_CONFIG["encoder_rates"],
94
+ latent_dim=DEFAULT_CONFIG["latent_dim"],
95
+ decoder_dim=DEFAULT_CONFIG["decoder_dim"],
96
+ decoder_rates=DEFAULT_CONFIG["decoder_rates"],
97
+ n_codebooks=DEFAULT_CONFIG["n_codebooks"],
98
+ codebook_size=DEFAULT_CONFIG["codebook_size"],
99
+ codebook_dim=DEFAULT_CONFIG["codebook_dim"],
100
+ quantizer_dropout=DEFAULT_CONFIG["quantizer_dropout"],
101
+ sample_rate=DEFAULT_CONFIG["sample_rate"],
102
+ ).eval()
103
+
104
+ # Load weights from SAM Audio checkpoint
105
+ print(f"Downloading checkpoint from {model_id}...")
106
+ checkpoint_path = hf_hub_download(
107
+ repo_id=model_id,
108
+ filename="checkpoint.pt",
109
+ )
110
+
111
+ print("Loading DACVAE weights from checkpoint...")
112
+ state_dict = torch.load(
113
+ checkpoint_path,
114
+ map_location="cpu",
115
+ weights_only=True,
116
+ mmap=True, # Memory-efficient loading
117
+ )
118
+
119
+ # Extract only DACVAE weights (prefixed with "audio_codec.")
120
+ dacvae_state_dict = {}
121
+ for k, v in state_dict.items():
122
+ if k.startswith("audio_codec."):
123
+ new_key = k.replace("audio_codec.", "")
124
+ dacvae_state_dict[new_key] = v.clone()
125
+
126
+ # Load weights
127
+ model.load_state_dict(dacvae_state_dict, strict=False)
128
+
129
+ # Clear large checkpoint from memory
130
+ del state_dict
131
+
132
+ print(f" ✓ Loaded {len(dacvae_state_dict)} DACVAE weight tensors")
133
+
134
+ # Calculate hop_length for reference
135
+ import numpy as np
136
+ hop_length = int(np.prod(DEFAULT_CONFIG["encoder_rates"]))
137
+ model.hop_length = hop_length
138
+ model.sample_rate = DEFAULT_CONFIG["sample_rate"]
139
+
140
+ return model
141
+
142
+
143
+ def export_encoder(
144
+ dacvae_model: dacvae.DACVAE,
145
+ output_path: str,
146
+ opset_version: int = 21,
147
+ device: str = "cpu",
148
+ ) -> None:
149
+ """Export DACVAE encoder to ONNX."""
150
+ print(f"Exporting DACVAE encoder to {output_path}...")
151
+
152
+ wrapper = DACVAEEncoderWrapper(
153
+ dacvae_model.encoder,
154
+ dacvae_model.quantizer
155
+ ).eval().to(device)
156
+
157
+ # Sample input: 1 second of audio at 48kHz
158
+ sample_rate = DEFAULT_CONFIG["sample_rate"]
159
+ dummy_audio = torch.randn(1, 1, sample_rate, device=device)
160
+
161
+ torch.onnx.export(
162
+ wrapper,
163
+ (dummy_audio,),
164
+ output_path,
165
+ input_names=["audio"],
166
+ output_names=["latent_features"],
167
+ dynamic_axes={
168
+ "audio": {0: "batch", 2: "samples"},
169
+ "latent_features": {0: "batch", 2: "time_steps"},
170
+ },
171
+ opset_version=opset_version,
172
+ do_constant_folding=True,
173
+ dynamo=True,
174
+ external_data=True,
175
+ )
176
+
177
+ print(f" ✓ Encoder exported successfully")
178
+
179
+ # Validate
180
+ import onnx
181
+ # Load without external data to avoid OOM - we just need to validate structure
182
+ model = onnx.load(output_path, load_external_data=False)
183
+ onnx.checker.check_model(model, full_check=False)
184
+ print(f" ✓ ONNX model validation passed")
185
+
186
+
187
+ def export_decoder(
188
+ dacvae_model: dacvae.DACVAE,
189
+ output_path: str,
190
+ opset_version: int = 21,
191
+ device: str = "cpu",
192
+ ) -> None:
193
+ """Export DACVAE decoder to ONNX."""
194
+ print(f"Exporting DACVAE decoder to {output_path}...")
195
+
196
+ wrapper = DACVAEDecoderWrapper(
197
+ dacvae_model.decoder,
198
+ dacvae_model.quantizer
199
+ ).eval().to(device)
200
+
201
+ # Sample input: 25 time steps (1 second at 48kHz with hop_length=1920)
202
+ hop_length = int(__import__("numpy").prod(DEFAULT_CONFIG["encoder_rates"]))
203
+ time_steps = DEFAULT_CONFIG["sample_rate"] // hop_length
204
+ dummy_latent = torch.randn(1, 128, time_steps, device=device)
205
+
206
+ torch.onnx.export(
207
+ wrapper,
208
+ (dummy_latent,),
209
+ output_path,
210
+ input_names=["latent_features"],
211
+ output_names=["waveform"],
212
+ dynamic_axes={
213
+ "latent_features": {0: "batch", 2: "time_steps"},
214
+ "waveform": {0: "batch", 2: "samples"},
215
+ },
216
+ opset_version=opset_version,
217
+ do_constant_folding=True,
218
+ dynamo=True,
219
+ external_data=True,
220
+ )
221
+
222
+ print(f" ✓ Decoder exported successfully")
223
+
224
+ # Validate
225
+ import onnx
226
+ # Load without external data to avoid OOM - we just need to validate structure
227
+ model = onnx.load(output_path, load_external_data=False)
228
+ onnx.checker.check_model(model, full_check=False)
229
+ print(f" ✓ ONNX model validation passed")
230
+
231
+
232
+ def verify_encoder(
233
+ dacvae_model: dacvae.DACVAE,
234
+ onnx_path: str,
235
+ device: str = "cpu",
236
+ tolerance: float = 1e-4,
237
+ ) -> bool:
238
+ """Verify ONNX encoder output matches PyTorch."""
239
+ import onnxruntime as ort
240
+ import numpy as np
241
+
242
+ print("Verifying encoder output...")
243
+
244
+ wrapper = DACVAEEncoderWrapper(
245
+ dacvae_model.encoder,
246
+ dacvae_model.quantizer
247
+ ).eval().to(device)
248
+
249
+ # Test with random audio
250
+ sample_rate = DEFAULT_CONFIG["sample_rate"]
251
+ test_audio = torch.randn(1, 1, sample_rate * 2, device=device) # 2 seconds
252
+
253
+ # PyTorch output
254
+ with torch.no_grad():
255
+ pytorch_output = wrapper(test_audio).cpu().numpy()
256
+
257
+ # ONNX Runtime output
258
+ sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
259
+ onnx_output = sess.run(
260
+ ["latent_features"],
261
+ {"audio": test_audio.cpu().numpy()}
262
+ )[0]
263
+
264
+ # Compare
265
+ max_diff = np.abs(pytorch_output - onnx_output).max()
266
+ mean_diff = np.abs(pytorch_output - onnx_output).mean()
267
+
268
+ print(f" Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
269
+
270
+ if max_diff > tolerance:
271
+ print(f" ✗ Verification failed (tolerance: {tolerance})")
272
+ return False
273
+
274
+ print(f" ✓ Verification passed (tolerance: {tolerance})")
275
+ return True
276
+
277
+
278
+ def verify_decoder(
279
+ dacvae_model: dacvae.DACVAE,
280
+ onnx_path: str,
281
+ device: str = "cpu",
282
+ tolerance: float = 1e-3,
283
+ ) -> bool:
284
+ """Verify ONNX decoder output matches PyTorch."""
285
+ import onnxruntime as ort
286
+ import numpy as np
287
+
288
+ print("Verifying decoder output...")
289
+
290
+ wrapper = DACVAEDecoderWrapper(
291
+ dacvae_model.decoder,
292
+ dacvae_model.quantizer
293
+ ).eval().to(device)
294
+
295
+ # Test with random latent
296
+ hop_length = int(np.prod(DEFAULT_CONFIG["encoder_rates"]))
297
+ time_steps = DEFAULT_CONFIG["sample_rate"] // hop_length # 25 steps = 1 second
298
+ test_latent = torch.randn(1, 128, time_steps, device=device)
299
+
300
+ # PyTorch output
301
+ with torch.no_grad():
302
+ pytorch_output = wrapper(test_latent).cpu().numpy()
303
+
304
+ # ONNX Runtime output
305
+ sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
306
+ onnx_output = sess.run(
307
+ ["waveform"],
308
+ {"latent_features": test_latent.cpu().numpy()}
309
+ )[0]
310
+
311
+ # Compare
312
+ max_diff = np.abs(pytorch_output - onnx_output).max()
313
+ mean_diff = np.abs(pytorch_output - onnx_output).mean()
314
+
315
+ print(f" Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
316
+
317
+ if max_diff > tolerance:
318
+ print(f" ✗ Verification failed (tolerance: {tolerance})")
319
+ return False
320
+
321
+ print(f" ✓ Verification passed (tolerance: {tolerance})")
322
+ return True
323
+
324
+
325
+ def main():
326
+ parser = argparse.ArgumentParser(description="Export DACVAE to ONNX")
327
+ parser.add_argument(
328
+ "--model-id",
329
+ type=str,
330
+ default="facebook/sam-audio-small",
331
+ help="HuggingFace model ID (default: facebook/sam-audio-small)",
332
+ )
333
+ parser.add_argument(
334
+ "--output-dir",
335
+ type=str,
336
+ default="onnx_models",
337
+ help="Output directory for ONNX models",
338
+ )
339
+ parser.add_argument(
340
+ "--opset-version",
341
+ type=int,
342
+ default=18,
343
+ help="ONNX opset version (default: 18)",
344
+ )
345
+ parser.add_argument(
346
+ "--device",
347
+ type=str,
348
+ default="cpu",
349
+ help="Device to use for export (default: cpu)",
350
+ )
351
+ parser.add_argument(
352
+ "--verify",
353
+ action="store_true",
354
+ help="Verify ONNX output matches PyTorch",
355
+ )
356
+ parser.add_argument(
357
+ "--tolerance",
358
+ type=float,
359
+ default=1e-4,
360
+ help="Tolerance for verification (default: 1e-4)",
361
+ )
362
+ parser.add_argument(
363
+ "--encoder-only",
364
+ action="store_true",
365
+ help="Export only the encoder",
366
+ )
367
+ parser.add_argument(
368
+ "--decoder-only",
369
+ action="store_true",
370
+ help="Export only the decoder",
371
+ )
372
+
373
+ args = parser.parse_args()
374
+
375
+ # Create output directory
376
+ os.makedirs(args.output_dir, exist_ok=True)
377
+
378
+ # Load model
379
+ dacvae_model = create_dacvae_model(args.model_id)
380
+
381
+ print(f"\nDACVAE Configuration:")
382
+ print(f" Model: {args.model_id}")
383
+ print(f" Sample rate: {DEFAULT_CONFIG['sample_rate']} Hz")
384
+ print(f" Hop length: {int(__import__('numpy').prod(DEFAULT_CONFIG['encoder_rates']))}")
385
+ print(f" Latent dim: 128 (continuous)")
386
+
387
+ # Export encoder
388
+ if not args.decoder_only:
389
+ encoder_path = os.path.join(args.output_dir, "dacvae_encoder.onnx")
390
+ export_encoder(
391
+ dacvae_model,
392
+ encoder_path,
393
+ opset_version=args.opset_version,
394
+ device=args.device,
395
+ )
396
+
397
+ if args.verify:
398
+ verify_encoder(
399
+ dacvae_model,
400
+ encoder_path,
401
+ device=args.device,
402
+ tolerance=args.tolerance,
403
+ )
404
+
405
+ # Export decoder
406
+ if not args.encoder_only:
407
+ decoder_path = os.path.join(args.output_dir, "dacvae_decoder.onnx")
408
+ export_decoder(
409
+ dacvae_model,
410
+ decoder_path,
411
+ opset_version=args.opset_version,
412
+ device=args.device,
413
+ )
414
+
415
+ if args.verify:
416
+ verify_decoder(
417
+ dacvae_model,
418
+ decoder_path,
419
+ device=args.device,
420
+ tolerance=args.tolerance * 10, # Decoder has higher tolerance
421
+ )
422
+
423
+ print(f"\n✓ Export complete! Models saved to {args.output_dir}/")
424
+
425
+
426
+ if __name__ == "__main__":
427
+ main()
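After export, the two DACVAE graphs chain directly in onnxruntime for an encode→decode round trip. A sketch assuming the default 48 kHz configuration above and that both .onnx files (plus their .data sidecars) were written to onnx_models/:

import numpy as np
import onnxruntime as ort

enc = ort.InferenceSession("onnx_models/dacvae_encoder.onnx", providers=["CPUExecutionProvider"])
dec = ort.InferenceSession("onnx_models/dacvae_decoder.onnx", providers=["CPUExecutionProvider"])

audio = np.random.randn(1, 1, 48000).astype(np.float32)        # 1 s of mono audio at 48 kHz
latent = enc.run(["latent_features"], {"audio": audio})[0]     # (1, 128, ~25) with hop length 1920
recon = dec.run(["waveform"], {"latent_features": latent})[0]  # (1, 1, ~48000)
print(latent.shape, recon.shape)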
onnx_export/export_dit.py ADDED
@@ -0,0 +1,574 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export DiT Transformer with unrolled ODE solver to ONNX format.
4
+
5
+ The DiT transformer is the core denoising model in SAM Audio. It uses a flow-based
6
+ generative model with an ODE solver. For ONNX export, we unroll the fixed-step
7
+ midpoint ODE solver into a static computation graph.
8
+
9
+ The default configuration uses:
10
+ - method: "midpoint"
11
+ - step_size: 2/32 (0.0625)
12
+ - integration range: [0, 1]
13
+ - total steps: 16
14
+
15
+ This creates a single ONNX model that performs the complete denoising process,
16
+ taking noise and conditioning as input and producing denoised audio features.
17
+
18
+ Usage:
19
+ python -m onnx_export.export_dit --output-dir onnx_models --verify
20
+ """
21
+
22
+ import os
23
+ import math
24
+ import argparse
25
+ import torch
26
+ import torch.nn as nn
27
+ from typing import Optional
28
+
29
+
30
+ class SinusoidalEmbedding(nn.Module):
31
+ """Sinusoidal timestep embedding (identical to SAMAudio implementation)."""
32
+
33
+ def __init__(self, dim, theta=10000):
34
+ super().__init__()
35
+ assert (dim % 2) == 0
36
+ half_dim = dim // 2
37
+ inv_freq = torch.exp(
38
+ -math.log(theta) * torch.arange(half_dim).float() / half_dim
39
+ )
40
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
41
+
42
+ def forward(self, x, pos=None):
43
+ if pos is None:
44
+ seq_len, device = x.shape[1], x.device
45
+ pos = torch.arange(seq_len, device=device)
46
+
47
+ emb = torch.einsum("i, j -> i j", pos, self.inv_freq)
48
+ emb = torch.cat((emb.cos(), emb.sin()), dim=-1)
49
+ return emb
50
+
51
+
52
+ class EmbedAnchors(nn.Module):
53
+ """Anchor embedding (identical to SAMAudio implementation)."""
54
+
55
+ def __init__(self, num_embeddings: int, embedding_dim: int, out_dim: int):
56
+ super().__init__()
57
+ self.embed = nn.Embedding(
58
+ num_embeddings + 1, embedding_dim, padding_idx=num_embeddings
59
+ )
60
+ self.gate = nn.Parameter(torch.tensor([0.0]))
61
+ self.proj = nn.Linear(embedding_dim, out_dim, bias=False)
62
+
63
+ def forward(
64
+ self,
65
+ x: torch.Tensor,
66
+ anchor_ids: Optional[torch.Tensor] = None,
67
+ anchor_alignment: Optional[torch.Tensor] = None,
68
+ ):
69
+ if anchor_ids is None:
70
+ return x
71
+
72
+ embs = self.embed(anchor_ids.gather(1, anchor_alignment))
73
+ proj = self.proj(embs)
74
+ return x + self.gate.tanh() * proj
75
+
76
+
77
+ class DiTSingleStepWrapper(nn.Module):
78
+ """
79
+ Wrapper for DiT that performs a single forward pass (one ODE evaluation).
80
+
81
+ This mirrors the SAMAudio.forward() method exactly.
82
+ """
83
+
84
+ def __init__(
85
+ self,
86
+ transformer: nn.Module,
87
+ proj: nn.Module,
88
+ align_masked_video: nn.Module,
89
+ embed_anchors: nn.Module,
90
+ timestep_emb: nn.Module,
91
+ memory_proj: nn.Module,
92
+ ):
93
+ super().__init__()
94
+ self.transformer = transformer
95
+ self.proj = proj
96
+ self.align_masked_video = align_masked_video
97
+ self.embed_anchors = embed_anchors
98
+ self.timestep_emb = timestep_emb
99
+ self.memory_proj = memory_proj
100
+
101
+ def forward(
102
+ self,
103
+ noisy_audio: torch.Tensor,
104
+ time: torch.Tensor,
105
+ audio_features: torch.Tensor,
106
+ text_features: torch.Tensor,
107
+ text_mask: torch.Tensor,
108
+ masked_video_features: torch.Tensor,
109
+ anchor_ids: torch.Tensor,
110
+ anchor_alignment: torch.Tensor,
111
+ audio_pad_mask: torch.Tensor,
112
+ ) -> torch.Tensor:
113
+ """
114
+ Single forward pass of the DiT (one ODE function evaluation).
115
+
116
+ This exactly mirrors SAMAudio.forward() method.
117
+ """
118
+ # Align inputs (concatenate noisy_audio with audio_features)
119
+ # Same as SAMAudio.align_inputs()
120
+ x = torch.cat(
121
+ [
122
+ noisy_audio,
123
+ torch.zeros_like(audio_features),
124
+ audio_features,
125
+ ],
126
+ dim=2,
127
+ )
128
+
129
+ projected = self.proj(x)
130
+ aligned = self.align_masked_video(projected, masked_video_features)
131
+ aligned = self.embed_anchors(aligned, anchor_ids, anchor_alignment)
132
+
133
+ # Timestep embedding and memory
134
+ # Same as SAMAudio.forward()
135
+ timestep_emb_val = self.timestep_emb(time, pos=time).unsqueeze(1)
136
+ memory = self.memory_proj(text_features) + timestep_emb_val
137
+
138
+ # Transformer forward
139
+ output = self.transformer(
140
+ aligned,
141
+ time,
142
+ padding_mask=audio_pad_mask,
143
+ memory=memory,
144
+ memory_padding_mask=text_mask,
145
+ )
146
+
147
+ return output
148
+
149
+
150
+ class UnrolledDiTWrapper(nn.Module):
151
+ """
152
+ DiT wrapper with unrolled midpoint ODE solver.
153
+
154
+ The midpoint method computes:
155
+ k1 = f(t, y)
156
+ k2 = f(t + h/2, y + h/2 * k1)
157
+ y_new = y + h * k2
158
+
159
+ With step_size=0.0625 and range [0,1], we have 16 steps.
160
+ """
161
+
162
+ def __init__(
163
+ self,
164
+ single_step: DiTSingleStepWrapper,
165
+ num_steps: int = 16,
166
+ ):
167
+ super().__init__()
168
+ self.single_step = single_step
169
+ self.num_steps = num_steps
170
+ self.step_size = 1.0 / num_steps
171
+
172
+ def forward(
173
+ self,
174
+ noise: torch.Tensor,
175
+ audio_features: torch.Tensor,
176
+ text_features: torch.Tensor,
177
+ text_mask: torch.Tensor,
178
+ masked_video_features: torch.Tensor,
179
+ anchor_ids: torch.Tensor,
180
+ anchor_alignment: torch.Tensor,
181
+ audio_pad_mask: torch.Tensor,
182
+ ) -> torch.Tensor:
183
+ """Complete denoising using unrolled midpoint ODE solver."""
184
+ B = noise.shape[0]
185
+ h = self.step_size
186
+ y = noise
187
+ t = torch.zeros(B, device=noise.device, dtype=noise.dtype)
188
+
189
+ for step in range(self.num_steps):
190
+ # k1 = f(t, y)
191
+ k1 = self.single_step(
192
+ y, t,
193
+ audio_features, text_features, text_mask,
194
+ masked_video_features, anchor_ids, anchor_alignment, audio_pad_mask
195
+ )
196
+
197
+ # k2 = f(t + h/2, y + h/2 * k1)
198
+ t_mid = t + h / 2
199
+ y_mid = y + (h / 2) * k1
200
+ k2 = self.single_step(
201
+ y_mid, t_mid,
202
+ audio_features, text_features, text_mask,
203
+ masked_video_features, anchor_ids, anchor_alignment, audio_pad_mask
204
+ )
205
+
206
+ # y = y + h * k2
207
+ y = y + h * k2
208
+ t = t + h
209
+
210
+ return y
211
+
212
+
213
+ def load_sam_audio_components(model_id: str = "facebook/sam-audio-small", device: str = "cpu"):
214
+ """
215
+ Load SAM Audio components needed for DiT export.
216
+
217
+ Since we can't load the full SAMAudio model (missing perception_models),
218
+ we construct the components directly and load weights from checkpoint.
219
+ """
220
+ import json
221
+ import sys
222
+ import types
223
+ import importlib.util
224
+ from huggingface_hub import hf_hub_download
225
+
226
+ print(f"Loading SAM Audio components from {model_id}...")
227
+
228
+ # Download config
229
+ config_path = hf_hub_download(repo_id=model_id, filename="config.json")
230
+ with open(config_path) as f:
231
+ config = json.load(f)
232
+
233
+ # Download checkpoint
234
+ checkpoint_path = hf_hub_download(repo_id=model_id, filename="checkpoint.pt")
235
+
236
+ # Use our standalone config that doesn't have 'core' dependencies
237
+ from onnx_export.standalone_config import TransformerConfig
238
+
239
+ sam_audio_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
240
+
241
+ # Create fake module hierarchy so transformer.py's relative imports work
242
+ if 'sam_audio' not in sys.modules:
243
+ sam_audio_pkg = types.ModuleType('sam_audio')
244
+ sam_audio_pkg.__path__ = [os.path.join(sam_audio_path, 'sam_audio')]
245
+ sys.modules['sam_audio'] = sam_audio_pkg
246
+
247
+ if 'sam_audio.model' not in sys.modules:
248
+ model_pkg = types.ModuleType('sam_audio.model')
249
+ model_pkg.__path__ = [os.path.join(sam_audio_path, 'sam_audio', 'model')]
250
+ sys.modules['sam_audio.model'] = model_pkg
251
+
252
+ # Register our standalone config as sam_audio.model.config
253
+ if 'sam_audio.model.config' not in sys.modules:
254
+ import onnx_export.standalone_config as standalone_config
255
+ sys.modules['sam_audio.model.config'] = standalone_config
256
+
257
+ # Now import transformer module - it will use our standalone config
258
+ transformer_spec = importlib.util.spec_from_file_location(
259
+ "sam_audio.model.transformer",
260
+ os.path.join(sam_audio_path, "sam_audio", "model", "transformer.py")
261
+ )
262
+ transformer_module = importlib.util.module_from_spec(transformer_spec)
263
+ sys.modules['sam_audio.model.transformer'] = transformer_module
264
+ transformer_spec.loader.exec_module(transformer_module)
265
+ DiT = transformer_module.DiT
266
+
267
+ # Import align module
268
+ align_spec = importlib.util.spec_from_file_location(
269
+ "sam_audio.model.align",
270
+ os.path.join(sam_audio_path, "sam_audio", "model", "align.py")
271
+ )
272
+ align_module = importlib.util.module_from_spec(align_spec)
273
+ sys.modules['sam_audio.model.align'] = align_module
274
+ align_spec.loader.exec_module(align_module)
275
+ AlignModalities = align_module.AlignModalities
276
+
277
+ # Create transformer
278
+ transformer_config = TransformerConfig(**config.get("transformer", {}))
279
+ transformer = DiT(transformer_config)
280
+
281
+ # Calculate dimensions
282
+ in_channels = config.get("in_channels", 768)
283
+ num_anchors = config.get("num_anchors", 3)
284
+ anchor_embedding_dim = config.get("anchor_embedding_dim", 128)
285
+
286
+ # Get vision encoder dim for align_masked_video
287
+ vision_config = config.get("vision_encoder", {})
288
+ vision_dim = vision_config.get("dim", 768)
289
+
290
+ # Create components exactly as SAMAudio does
291
+ proj = nn.Linear(in_channels, transformer_config.d_model)
292
+ align_masked_video = AlignModalities(vision_dim, transformer_config.d_model)
293
+ embed_anchors = EmbedAnchors(num_anchors, anchor_embedding_dim, transformer_config.d_model)
294
+ timestep_emb = SinusoidalEmbedding(transformer_config.d_model)
295
+
296
+ # Memory projection for text features
297
+ text_encoder_config = config.get("text_encoder", {})
298
+ text_encoder_dim = text_encoder_config.get("dim", 1024) # google/flan-t5-large
299
+ memory_proj = nn.Linear(text_encoder_dim, transformer_config.d_model)
300
+
301
+ # Load weights from checkpoint
302
+ print("Loading weights from checkpoint...")
303
+ state_dict = torch.load(checkpoint_path, map_location="cpu", mmap=True)
304
+
305
+ # Filter and load weights for each component
306
+ transformer_state = {}
307
+ proj_state = {}
308
+ align_state = {}
309
+ embed_anchors_state = {}
310
+ memory_proj_state = {}
311
+
312
+ for key, value in state_dict.items():
313
+ if key.startswith("transformer."):
314
+ new_key = key[len("transformer."):]
315
+ transformer_state[new_key] = value
316
+ elif key.startswith("proj."):
317
+ new_key = key[len("proj."):]
318
+ proj_state[new_key] = value
319
+ elif key.startswith("align_masked_video."):
320
+ new_key = key[len("align_masked_video."):]
321
+ align_state[new_key] = value
322
+ elif key.startswith("embed_anchors."):
323
+ new_key = key[len("embed_anchors."):]
324
+ embed_anchors_state[new_key] = value
325
+ elif key.startswith("memory_proj."):
326
+ new_key = key[len("memory_proj."):]
327
+ memory_proj_state[new_key] = value
328
+
329
+ transformer.load_state_dict(transformer_state)
330
+ proj.load_state_dict(proj_state)
331
+ align_masked_video.load_state_dict(align_state)
332
+ embed_anchors.load_state_dict(embed_anchors_state)
333
+ memory_proj.load_state_dict(memory_proj_state)
334
+
335
+ print(f" ✓ Loaded transformer weights ({len(transformer_state)} tensors)")
336
+ print(f" ✓ Loaded component weights")
337
+
338
+ # Create single step wrapper
339
+ single_step = DiTSingleStepWrapper(
340
+ transformer=transformer,
341
+ proj=proj,
342
+ align_masked_video=align_masked_video,
343
+ embed_anchors=embed_anchors,
344
+ timestep_emb=timestep_emb,
345
+ memory_proj=memory_proj,
346
+ ).eval().to(device)
347
+
348
+ return single_step, config
349
+
350
+
351
+ def create_sample_inputs(batch_size: int = 1, seq_len: int = 25, device: str = "cpu"):
352
+ """Create sample inputs for tracing."""
353
+ latent_dim = 128
354
+ text_dim = 768 # T5-base hidden size (SAM Audio was trained with 768-dim text)
355
+ vision_dim = 1024 # Vision encoder dim from config
356
+ text_len = 77
357
+
358
+ return {
359
+ "noisy_audio": torch.randn(batch_size, seq_len, 2 * latent_dim, device=device),
360
+ "time": torch.zeros(batch_size, device=device),
361
+ "audio_features": torch.randn(batch_size, seq_len, 2 * latent_dim, device=device),
362
+ "text_features": torch.randn(batch_size, text_len, text_dim, device=device),
363
+ "text_mask": torch.ones(batch_size, text_len, dtype=torch.bool, device=device),
364
+ "masked_video_features": torch.zeros(batch_size, vision_dim, seq_len, device=device),
365
+ "anchor_ids": torch.zeros(batch_size, seq_len, dtype=torch.long, device=device),
366
+ "anchor_alignment": torch.zeros(batch_size, seq_len, dtype=torch.long, device=device),
367
+ "audio_pad_mask": torch.ones(batch_size, seq_len, dtype=torch.bool, device=device),
368
+ }
369
+
370
+
371
+ def export_dit_single_step(
372
+ single_step: DiTSingleStepWrapper,
373
+ output_path: str,
374
+ opset_version: int = 21,
375
+ device: str = "cpu",
376
+ fp16: bool = False,
377
+ ):
378
+ """Export single-step DiT to ONNX (for runtime ODE solving)."""
379
+ import onnx
380
+
381
+ print(f"Exporting DiT single-step to {output_path}...")
382
+
383
+ # Convert to FP16 if requested
384
+ if fp16:
385
+ print(" Converting model to FP16...")
386
+ single_step = single_step.half()
387
+
388
+ sample_inputs = create_sample_inputs(device=device)
389
+
390
+ # Convert float inputs to FP16 if exporting in FP16
391
+ if fp16:
392
+ for key, value in sample_inputs.items():
393
+ if value.dtype == torch.float32:
394
+ sample_inputs[key] = value.half()
395
+
396
+ torch.onnx.export(
397
+ single_step,
398
+ tuple(sample_inputs.values()),
399
+ output_path,
400
+ input_names=list(sample_inputs.keys()),
401
+ output_names=["velocity"],
402
+ dynamic_axes={
403
+ "noisy_audio": {0: "batch_size", 1: "seq_len"},
404
+ "time": {0: "batch_size"},
405
+ "audio_features": {0: "batch_size", 1: "seq_len"},
406
+ "text_features": {0: "batch_size", 1: "text_len"},
407
+ "text_mask": {0: "batch_size", 1: "text_len"},
408
+ "masked_video_features": {0: "batch_size", 2: "seq_len"},
409
+ "anchor_ids": {0: "batch_size", 1: "seq_len"},
410
+ "anchor_alignment": {0: "batch_size", 1: "seq_len"},
411
+ "audio_pad_mask": {0: "batch_size", 1: "seq_len"},
412
+ "velocity": {0: "batch_size", 1: "seq_len"},
413
+ },
414
+ opset_version=opset_version,
415
+ do_constant_folding=True,
416
+ dynamo=True,
417
+ external_data=True,
418
+ )
419
+
420
+ print(" ✓ DiT single-step exported successfully")
421
+
422
+ # When using external_data=True, we can't run check_model on a model
423
+ # loaded without external data - the checker validates data references.
424
+ # Since torch.onnx.export with dynamo=True already validates the model,
425
+ # we just verify the files exist.
426
+ external_data_path = output_path + ".data"
427
+ if os.path.exists(external_data_path):
428
+ print(f" ✓ External data file exists ({os.path.getsize(external_data_path) / 1e9:.2f} GB)")
429
+ else:
430
+ raise RuntimeError(f"External data file missing: {external_data_path}")
431
+
432
+ # Verify the ONNX file structure is valid (without loading weights)
433
+ model = onnx.load(output_path, load_external_data=False)
434
+ print(f" ✓ ONNX model structure loaded ({len(model.graph.node)} nodes)")
435
+
436
+ return True
437
+
438
+
439
+ def verify_dit_single_step(
440
+ single_step: DiTSingleStepWrapper,
441
+ onnx_path: str,
442
+ device: str = "cpu",
443
+ tolerance: float = 1e-3,
444
+ ) -> bool:
445
+ """Verify single-step ONNX output matches PyTorch."""
446
+ import onnxruntime as ort
447
+ import numpy as np
448
+
449
+ print("Verifying DiT single-step output...")
450
+
451
+ sample_inputs = create_sample_inputs(device=device)
452
+
453
+ # PyTorch output
454
+ with torch.no_grad():
455
+ pytorch_output = single_step(**sample_inputs).cpu().numpy()
456
+
457
+ # ONNX Runtime output
458
+ sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
459
+
460
+ onnx_inputs = {}
461
+ for name, tensor in sample_inputs.items():
462
+ if tensor.dtype == torch.bool:
463
+ onnx_inputs[name] = tensor.cpu().numpy().astype(bool)
464
+ elif tensor.dtype == torch.long:
465
+ onnx_inputs[name] = tensor.cpu().numpy().astype(np.int64)
466
+ else:
467
+ onnx_inputs[name] = tensor.cpu().numpy().astype(np.float32)
468
+
469
+ onnx_output = sess.run(["velocity"], onnx_inputs)[0]
470
+
471
+ # Compare
472
+ max_diff = np.abs(pytorch_output - onnx_output).max()
473
+ mean_diff = np.abs(pytorch_output - onnx_output).mean()
474
+
475
+ print(f" Max difference: {max_diff:.2e}")
476
+ print(f" Mean difference: {mean_diff:.2e}")
477
+
478
+ if max_diff < tolerance:
479
+ print(f" ✓ Verification passed (tolerance: {tolerance})")
480
+ return True
481
+ else:
482
+ print(f" ✗ Verification failed (tolerance: {tolerance})")
483
+ return False
484
+
485
+
486
+ def main():
487
+ parser = argparse.ArgumentParser(description="Export DiT Transformer to ONNX")
488
+ parser.add_argument(
489
+ "--model-id",
490
+ type=str,
491
+ default="facebook/sam-audio-small",
492
+ help="SAM Audio model ID from HuggingFace",
493
+ )
494
+ parser.add_argument(
495
+ "--output-dir",
496
+ type=str,
497
+ default="onnx_models",
498
+ help="Output directory for ONNX models",
499
+ )
500
+ parser.add_argument(
501
+ "--num-steps",
502
+ type=int,
503
+ default=16,
504
+ help="Number of ODE solver steps (default: 16)",
505
+ )
506
+ parser.add_argument(
507
+ "--opset",
508
+ type=int,
509
+ default=21,
510
+ help="ONNX opset version (default: 21)",
511
+ )
512
+ parser.add_argument(
513
+ "--device",
514
+ type=str,
515
+ default="cpu",
516
+ help="Device to use for export (default: cpu)",
517
+ )
518
+ parser.add_argument(
519
+ "--verify",
520
+ action="store_true",
521
+ help="Verify ONNX output matches PyTorch",
522
+ )
523
+ parser.add_argument(
524
+ "--tolerance",
525
+ type=float,
526
+ default=1e-3,
527
+ help="Tolerance for verification (default: 1e-3)",
528
+ )
529
+ parser.add_argument(
530
+ "--fp16",
531
+ action="store_true",
532
+ help="Export model in FP16 precision (half the size)",
533
+ )
534
+
535
+ args = parser.parse_args()
536
+
537
+ # Create output directory
538
+ os.makedirs(args.output_dir, exist_ok=True)
539
+
540
+ # Load components
541
+ single_step, config = load_sam_audio_components(args.model_id, args.device)
542
+
543
+ print(f"\nDiT Configuration:")
544
+ print(f" Model: {args.model_id}")
545
+ print(f" ODE steps: {args.num_steps}")
546
+ print(f" Step size: {1.0/args.num_steps:.4f}")
547
+
548
+ # Export single-step model
549
+ single_step_path = os.path.join(args.output_dir, "dit_single_step.onnx")
550
+ export_dit_single_step(
551
+ single_step,
552
+ single_step_path,
553
+ opset_version=args.opset,
554
+ device=args.device,
555
+ fp16=args.fp16,
556
+ )
557
+
558
+ if args.fp16:
559
+ print(f" ✓ Model exported in FP16 precision")
560
+
561
+ # Verify single-step
562
+ if args.verify:
563
+ verify_dit_single_step(
564
+ single_step,
565
+ single_step_path,
566
+ device=args.device,
567
+ tolerance=args.tolerance,
568
+ )
569
+
570
+ print(f"\n✓ Export complete! Model saved to {args.output_dir}")
571
+
572
+
573
+ if __name__ == "__main__":
574
+ main()
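Only the single-step DiT graph is exported here, so the midpoint loop from UnrolledDiTWrapper has to be reproduced by the caller. A hedged sketch against an fp32 export with onnxruntime, using the input/output names and 16-step schedule defined above; the conditioning tensors are random placeholders that would normally come from the DACVAE, T5, and vision encoders:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("onnx_models/dit_single_step.onnx", providers=["CPUExecutionProvider"])

B, T, L, TXT = 1, 25, 128, 77
cond = {
    "audio_features": np.random.randn(B, T, 2 * L).astype(np.float32),
    "text_features": np.random.randn(B, TXT, 768).astype(np.float32),
    "text_mask": np.ones((B, TXT), dtype=bool),
    "masked_video_features": np.zeros((B, 1024, T), dtype=np.float32),
    "anchor_ids": np.zeros((B, T), dtype=np.int64),
    "anchor_alignment": np.zeros((B, T), dtype=np.int64),
    "audio_pad_mask": np.ones((B, T), dtype=bool),
}

num_steps = 16
h = 1.0 / num_steps
y = np.random.randn(B, T, 2 * L).astype(np.float32)  # starting noise
t = np.zeros(B, dtype=np.float32)

for _ in range(num_steps):  # midpoint method, as in UnrolledDiTWrapper
    k1 = sess.run(["velocity"], {"noisy_audio": y, "time": t, **cond})[0]
    k2 = sess.run(["velocity"], {"noisy_audio": y + (h / 2) * k1, "time": t + h / 2, **cond})[0]
    y = y + h * k2
    t = t + h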
onnx_export/export_peaframe.py ADDED
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export PE-A-Frame (Perception Encoder Audio Frame) span predictor to ONNX.
4
+
5
+ The PE-A-Frame model is used for automatic anchor detection in SAM Audio.
6
+ It analyzes audio features and predicts which segments correspond to the
7
+ target audio source.
8
+
9
+ Usage:
10
+ python -m onnx_export.export_peaframe --output-dir onnx_models --verify
11
+ """
12
+
13
+ import os
14
+ import argparse
15
+ import torch
16
+ import torch.nn as nn
17
+ from typing import Optional
18
+
19
+
20
+ class PEAFrameWrapper(nn.Module):
21
+ """
22
+ Wrapper for PE-A-Frame model for ONNX export.
23
+
24
+ Exposes the forward pass that takes audio features and returns
25
+ frame-level predictions.
26
+ """
27
+
28
+ def __init__(self, model: nn.Module):
29
+ super().__init__()
30
+ self.model = model
31
+
32
+ def forward(
33
+ self,
34
+ audio_features: torch.Tensor,
35
+ audio_mask: Optional[torch.Tensor] = None,
36
+ ) -> torch.Tensor:
37
+ """
38
+ Forward pass for span prediction.
39
+
40
+ Args:
41
+ audio_features: Audio features [batch, seq_len, hidden_dim]
42
+ audio_mask: Optional attention mask [batch, seq_len]
43
+
44
+ Returns:
45
+ Frame-level predictions [batch, seq_len, num_classes]
46
+ """
47
+ return self.model(audio_features, attention_mask=audio_mask)
48
+
49
+
50
+ def load_peaframe_model(config_name: str = "pe-a-frame-large", device: str = "cpu"):
51
+ """Load the PE-A-Frame model from perception_models."""
52
+ from core.audio_visual_encoder.pe import PEAudioFrame
53
+
54
+ print(f"Loading PE-A-Frame model: {config_name}...")
55
+ model = PEAudioFrame.from_config(config_name, pretrained=True)
56
+ model = model.eval().to(device)
57
+
58
+ num_params = sum(p.numel() for p in model.parameters())
59
+ print(f" ✓ Model loaded: {num_params:,} parameters")
60
+
61
+ return model
62
+
63
+
64
+ def get_tokenizer(model):
65
+ """Get the text tokenizer from the model config."""
66
+ from transformers import AutoTokenizer
67
+
68
+ text_model_name = model.config.text_model._name_or_path
69
+ return AutoTokenizer.from_pretrained(text_model_name)
70
+
71
+
72
+ def create_sample_inputs(model, batch_size: int = 1, device: str = "cpu"):
73
+ """Create sample inputs for tracing."""
74
+ tokenizer = get_tokenizer(model)
75
+
76
+ # Sample text query
77
+ text = "a person speaking"
78
+ tokens = tokenizer(
79
+ [text] * batch_size,
80
+ return_tensors="pt",
81
+ padding=True,
82
+ truncation=True,
83
+ max_length=77,
84
+ )
85
+
86
+ # Sample audio (10 seconds at 16kHz)
87
+ # DAC encoder expects (batch, channels, samples) format
88
+ sample_rate = 16000
89
+ audio_len = sample_rate * 10
90
+ audio = torch.randn(batch_size, 1, audio_len, device=device) # Added channel dimension
91
+
92
+ return {
93
+ "input_ids": tokens["input_ids"].to(device),
94
+ "attention_mask": tokens["attention_mask"].to(device),
95
+ "input_values": audio,
96
+ }
97
+
98
+
99
+ def export_peaframe(
100
+ model: nn.Module,
101
+ output_path: str,
102
+ opset_version: int = 21,
103
+ device: str = "cpu",
104
+ ):
105
+ """Export PE-A-Frame to ONNX."""
106
+ import onnx
107
+
108
+ print(f"Exporting PE-A-Frame to {output_path}...")
109
+
110
+ sample_inputs = create_sample_inputs(model, device=device)
111
+
112
+ # Put model in eval mode
113
+ model = model.eval()
114
+
115
+ # Test forward pass first
116
+ with torch.no_grad():
117
+ try:
118
+ output = model(
119
+ input_ids=sample_inputs["input_ids"],
120
+ input_values=sample_inputs["input_values"],
121
+ attention_mask=sample_inputs["attention_mask"],
122
+ return_spans=False, # Disable span return for ONNX (list output)
123
+ )
124
+ print(f" Test forward pass: audio_embeds shape = {output.audio_embeds.shape}")
125
+ print(f" Test forward pass: text_embeds shape = {output.text_embeds.shape}")
126
+ except Exception as e:
127
+ print(f" Forward pass failed: {e}")
128
+ raise
129
+
130
+ # Create a wrapper that returns just the audio embeddings for simpler ONNX
131
+ class PEAFrameONNXWrapper(nn.Module):
132
+ def __init__(self, model):
133
+ super().__init__()
134
+ self.model = model
135
+
136
+ def forward(self, input_ids, input_values, attention_mask):
137
+ output = self.model(
138
+ input_ids=input_ids,
139
+ input_values=input_values,
140
+ attention_mask=attention_mask,
141
+ return_spans=False,
142
+ )
143
+ return output.audio_embeds, output.text_embeds
144
+
145
+ wrapper = PEAFrameONNXWrapper(model)
146
+ wrapper.eval()
147
+
148
+ torch.onnx.export(
149
+ wrapper,
150
+ (sample_inputs["input_ids"], sample_inputs["input_values"], sample_inputs["attention_mask"]),
151
+ output_path,
152
+ input_names=["input_ids", "input_values", "attention_mask"],
153
+ output_names=["audio_embeds", "text_embeds"],
154
+ dynamic_axes={
155
+ "input_ids": {0: "batch_size", 1: "seq_len"},
156
+ "input_values": {0: "batch_size", 2: "audio_len"},
157
+ "attention_mask": {0: "batch_size", 1: "seq_len"},
158
+ "audio_embeds": {0: "batch_size", 1: "num_frames"},
159
+ "text_embeds": {0: "batch_size"},
160
+ },
161
+ opset_version=opset_version,
162
+ do_constant_folding=True,
163
+ external_data=True,
164
+ )
165
+
166
+ print(" ✓ PE-A-Frame exported successfully")
167
+
168
+ # Load without external data to avoid OOM - we just need to validate structure
169
+ onnx_model = onnx.load(output_path, load_external_data=False)
170
+ onnx.checker.check_model(onnx_model, full_check=False)
171
+ print(" ✓ ONNX model validation passed")
172
+
173
+ return True
174
+
175
+
176
+ def verify_peaframe(
177
+ model: nn.Module,
178
+ onnx_path: str,
179
+ device: str = "cpu",
180
+ tolerance: float = 1e-3,
181
+ ) -> bool:
182
+ """Verify ONNX output matches PyTorch."""
183
+ import onnxruntime as ort
184
+ import numpy as np
185
+
186
+ print("Verifying PE-A-Frame output...")
187
+
188
+ sample_inputs = create_sample_inputs(model, device=device)
189
+
190
+ # PyTorch output
191
+ model = model.eval()
192
+ with torch.no_grad():
193
+ pytorch_output = model(
194
+ input_ids=sample_inputs["input_ids"],
195
+ input_values=sample_inputs["input_values"],
196
+ attention_mask=sample_inputs["attention_mask"],
197
+ return_spans=False,
198
+ )
199
+ pytorch_audio_embeds = pytorch_output.audio_embeds.cpu().numpy()
200
+ pytorch_text_embeds = pytorch_output.text_embeds.cpu().numpy()
201
+
202
+ # ONNX Runtime output
203
+ sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
204
+
205
+ onnx_inputs = {
206
+ "input_ids": sample_inputs["input_ids"].cpu().numpy().astype(np.int64),
207
+ "input_values": sample_inputs["input_values"].cpu().numpy().astype(np.float32),
208
+ "attention_mask": sample_inputs["attention_mask"].cpu().numpy().astype(np.int64),
209
+ }
210
+
211
+ onnx_outputs = sess.run(["audio_embeds", "text_embeds"], onnx_inputs)
212
+ onnx_audio_embeds = onnx_outputs[0]
213
+ onnx_text_embeds = onnx_outputs[1]
214
+
215
+ # Compare
216
+ audio_max_diff = np.abs(pytorch_audio_embeds - onnx_audio_embeds).max()
217
+ text_max_diff = np.abs(pytorch_text_embeds - onnx_text_embeds).max()
218
+
219
+ print(f" Audio embeds max diff: {audio_max_diff:.2e}")
220
+ print(f" Text embeds max diff: {text_max_diff:.2e}")
221
+
222
+ max_diff = max(audio_max_diff, text_max_diff)
223
+ if max_diff < tolerance:
224
+ print(f" ✓ Verification passed (tolerance: {tolerance})")
225
+ return True
226
+ else:
227
+ print(f" ✗ Verification failed (tolerance: {tolerance})")
228
+ return False
229
+
230
+
231
+ def main():
232
+ parser = argparse.ArgumentParser(description="Export PE-A-Frame to ONNX")
233
+ parser.add_argument(
234
+ "--config",
235
+ type=str,
236
+ default="pe-a-frame-large",
237
+ help="PE-A-Frame config name",
238
+ )
239
+ parser.add_argument(
240
+ "--output-dir",
241
+ type=str,
242
+ default="onnx_models",
243
+ help="Output directory for ONNX models",
244
+ )
245
+ parser.add_argument(
246
+ "--opset",
247
+ type=int,
248
+ default=18,
249
+ help="ONNX opset version",
250
+ )
251
+ parser.add_argument(
252
+ "--device",
253
+ type=str,
254
+ default="cpu",
255
+ help="Device to use",
256
+ )
257
+ parser.add_argument(
258
+ "--verify",
259
+ action="store_true",
260
+ help="Verify ONNX output",
261
+ )
262
+ parser.add_argument(
263
+ "--tolerance",
264
+ type=float,
265
+ default=1e-3,
266
+ help="Verification tolerance",
267
+ )
268
+
269
+ args = parser.parse_args()
270
+
271
+ os.makedirs(args.output_dir, exist_ok=True)
272
+
273
+ # Load model
274
+ model = load_peaframe_model(args.config, args.device)
275
+
276
+ # Export
277
+ output_path = os.path.join(args.output_dir, "peaframe.onnx")
278
+ export_peaframe(model, output_path, args.opset, args.device)
279
+
280
+ # Verify
281
+ if args.verify:
282
+ verify_peaframe(model, output_path, args.device, args.tolerance)
283
+
284
+ print(f"\n✓ Export complete! Model saved to {output_path}")
285
+
286
+
287
+ if __name__ == "__main__":
288
+ main()
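
For reference, a minimal sketch (not part of this commit) of how the exported peaframe.onnx could be driven from ONNX Runtime. The input and output names come from the export call above; the token IDs, sequence length, and audio length below are placeholder assumptions rather than values taken from the repo.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("onnx_models/peaframe.onnx", providers=["CPUExecutionProvider"])

batch, seq_len, audio_len = 1, 16, 160_000  # hypothetical sizes
feeds = {
    "input_ids": np.random.randint(0, 1000, (batch, seq_len), dtype=np.int64),  # placeholder tokens
    "attention_mask": np.ones((batch, seq_len), dtype=np.int64),
    "input_values": np.random.randn(batch, audio_len).astype(np.float32),       # placeholder audio
}
audio_embeds, text_embeds = sess.run(["audio_embeds", "text_embeds"], feeds)
print(audio_embeds.shape, text_embeds.shape)

A real caller would tokenize the prompt and preprocess the audio exactly as create_sample_inputs does during export.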
onnx_export/export_t5.py ADDED
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Export T5 Text Encoder to ONNX format.
+
+The T5 encoder takes tokenized input_ids and attention_mask, and produces
+hidden states. For SAM Audio inference, the output hidden states and attention
+mask are used as conditioning for the DiT transformer.
+
+Usage:
+    python -m onnx_export.export_t5 --output-dir onnx_models --verify
+"""
+
+import os
+import argparse
+import torch
+import torch.nn as nn
+
+
+class T5EncoderWrapper(nn.Module):
+    """
+    Wrapper for T5EncoderModel that provides a clean interface for ONNX export.
+
+    The wrapper takes tokenized inputs (input_ids, attention_mask) and returns
+    the last hidden state. This matches how SAMAudio's T5TextEncoder uses the model.
+    """
+
+    def __init__(self, t5_model, max_length: int = 77):
+        super().__init__()
+        self.model = t5_model
+        self.max_length = max_length
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            input_ids: Tokenized input IDs, shape (batch, seq_len)
+            attention_mask: Attention mask, shape (batch, seq_len)
+
+        Returns:
+            hidden_states: T5 encoder output, shape (batch, seq_len, hidden_dim)
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )
+        return outputs.last_hidden_state
+
+
+def load_t5_encoder(model_name: str = "google-t5/t5-base", device: str = "cuda"):
+    """
+    Load T5 encoder model and tokenizer.
+
+    SAM Audio's DiT was trained with T5-base (768-dim) text features.
+    """
+    from transformers import T5EncoderModel, AutoTokenizer
+
+    print(f"Loading T5 encoder: {model_name}...")
+
+    model = T5EncoderModel.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    model = model.eval().to(device)
+
+    return model, tokenizer
+
+
+def export_t5_encoder(
+    t5_model,
+    tokenizer,
+    output_path: str,
+    opset_version: int = 21,
+    max_length: int = 77,
+    device: str = "cuda",
+):
+    """Export T5 encoder to ONNX format."""
+    import onnx
+
+    print(f"Exporting T5 encoder to {output_path}...")
+
+    wrapper = T5EncoderWrapper(t5_model, max_length=max_length).eval().to(device)
+
+    # Create sample input
+    sample_text = ["A dog barking loudly in the background"]
+    encoded = tokenizer(
+        sample_text,
+        truncation=True,
+        max_length=max_length,
+        padding="max_length",  # Pad to max_length for consistent shape
+        return_tensors="pt",
+    )
+
+    sample_input_ids = encoded["input_ids"].to(device)
+    sample_attention_mask = encoded["attention_mask"].to(device)
+
+    # Export using torch.onnx.export
+    torch.onnx.export(
+        wrapper,
+        (sample_input_ids, sample_attention_mask),
+        output_path,
+        input_names=["input_ids", "attention_mask"],
+        output_names=["hidden_states"],
+        dynamic_axes={
+            "input_ids": {0: "batch_size", 1: "sequence_length"},
+            "attention_mask": {0: "batch_size", 1: "sequence_length"},
+            "hidden_states": {0: "batch_size", 1: "sequence_length"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,  # T5-large is ~1GB
+    )
+
+    print("  ✓ T5 encoder exported successfully")
+
+    # Load without external data to avoid OOM - we just need to validate structure
+    model = onnx.load(output_path, load_external_data=False)
+    onnx.checker.check_model(model, full_check=False)
+    print("  ✓ ONNX model validation passed")
+
+    return True
+
+
+def verify_t5_encoder(
+    t5_model,
+    tokenizer,
+    onnx_path: str,
+    max_length: int = 77,
+    device: str = "cuda",
+    tolerance: float = 1e-4,
+) -> bool:
+    """Verify ONNX T5 encoder output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+
+    print("Verifying T5 encoder output...")
+
+    wrapper = T5EncoderWrapper(t5_model, max_length=max_length).eval().to(device)
+
+    # Test with multiple texts
+    test_texts = [
+        "A dog barking in the distance",
+        "Piano music playing softly",
+        "Rain falling on a rooftop",
+    ]
+
+    for text in test_texts:
+        # Tokenize
+        encoded = tokenizer(
+            [text],
+            truncation=True,
+            max_length=max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+
+        input_ids = encoded["input_ids"].to(device)
+        attention_mask = encoded["attention_mask"].to(device)
+
+        # PyTorch output
+        with torch.no_grad():
+            pytorch_output = wrapper(input_ids, attention_mask).cpu().numpy()
+
+        # ONNX Runtime output
+        sess = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])
+        onnx_output = sess.run(
+            ["hidden_states"],
+            {
+                "input_ids": input_ids.cpu().numpy().astype(np.int64),
+                "attention_mask": attention_mask.cpu().numpy().astype(np.int64),
+            }
+        )[0]
+
+        # Compare
+        max_diff = np.abs(pytorch_output - onnx_output).max()
+        mean_diff = np.abs(pytorch_output - onnx_output).mean()
+
+        print(f"  Text: '{text[:30]}...'")
+        print(f"  Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
+
+        if max_diff > tolerance:
+            print(f"  ✗ Verification failed for text: {text}")
+            return False
+
+    print(f"  ✓ Verification passed (tolerance: {tolerance})")
+    return True
+
+
+def save_tokenizer_config(tokenizer, output_dir: str):
+    """
+    Save tokenizer vocabulary and configuration for runtime use.
+
+    This allows the ONNX runtime to perform tokenization without
+    needing the full transformers library.
+    """
+    import json
+
+    tokenizer_dir = os.path.join(output_dir, "tokenizer")
+    tokenizer.save_pretrained(tokenizer_dir)
+
+    # Also save a simple config for ONNX.js usage
+    config = {
+        "model_name": tokenizer.name_or_path,
+        "max_length": 77,
+        "vocab_size": tokenizer.vocab_size,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+
+    config_path = os.path.join(output_dir, "tokenizer_config.json")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+
+    print(f"  ✓ Tokenizer saved to {tokenizer_dir}")
+    return tokenizer_dir
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Export T5 Text Encoder to ONNX")
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="google-t5/t5-base",
+        help="T5 model name from HuggingFace (default: google-t5/t5-base)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=77,
+        help="Maximum token sequence length (default: 77)",
+    )
+    parser.add_argument(
+        "--opset",
+        type=int,
+        default=18,
+        help="ONNX opset version (default: 18)",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to use for export (default: cuda)",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output matches PyTorch",
+    )
+    parser.add_argument(
+        "--tolerance",
+        type=float,
+        default=1e-4,
+        help="Tolerance for verification (default: 1e-4)",
+    )
+    parser.add_argument(
+        "--save-tokenizer",
+        action="store_true",
+        default=True,
+        help="Save tokenizer for runtime use (default: True)",
+    )
+
+    args = parser.parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load T5
+    t5_model, tokenizer = load_t5_encoder(args.model_name, args.device)
+
+    print(f"\nT5 Configuration:")
+    print(f"  Model: {args.model_name}")
+    print(f"  Hidden size: {t5_model.config.d_model}")
+    print(f"  Max length: {args.max_length}")
+    print(f"  Vocab size: {tokenizer.vocab_size}")
+
+    # Export
+    encoder_path = os.path.join(args.output_dir, "t5_encoder.onnx")
+    export_t5_encoder(
+        t5_model,
+        tokenizer,
+        encoder_path,
+        opset_version=args.opset,
+        max_length=args.max_length,
+        device=args.device,
+    )
+
+    # Save tokenizer
+    if args.save_tokenizer:
+        save_tokenizer_config(tokenizer, args.output_dir)
+
+    # Verify
+    if args.verify:
+        verify_t5_encoder(
+            t5_model,
+            tokenizer,
+            encoder_path,
+            max_length=args.max_length,
+            device=args.device,
+            tolerance=args.tolerance,
+        )
+
+    print(f"\n✓ Export complete! Model saved to {encoder_path}")
+
+
+if __name__ == "__main__":
+    main()
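
As a usage note, a hedged sketch of consuming the exported encoder at runtime: tokenize with the tokenizer directory written by save_tokenizer_config, then feed t5_encoder.onnx through ONNX Runtime. The paths below are assumptions based on the script's defaults.

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("onnx_models/tokenizer")
enc = tokenizer(
    ["Rain falling on a rooftop"],
    truncation=True, max_length=77, padding="max_length", return_tensors="np",
)

sess = ort.InferenceSession("onnx_models/t5_encoder.onnx", providers=["CPUExecutionProvider"])
(hidden_states,) = sess.run(
    ["hidden_states"],
    {
        "input_ids": enc["input_ids"].astype(np.int64),
        "attention_mask": enc["attention_mask"].astype(np.int64),
    },
)
print(hidden_states.shape)  # (1, 77, 768) for t5-base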
onnx_export/export_vision.py ADDED
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+import os
+import torch
+import torch.nn as nn
+import onnx
+from sam_audio.model.vision_encoder import PerceptionEncoder
+from onnx_export.standalone_config import PerceptionEncoderConfig
+
+
+class VisionEncoderWrapper(nn.Module):
+    """
+    Wrapper for the Vision Encoder (CLIP visual backbone).
+    """
+
+    def __init__(self, vision_encoder):
+        super().__init__()
+        self.model = vision_encoder.model
+        self.normalize = vision_encoder.normalize_feature
+
+    def forward(self, x):
+        # x: (N, 3, H, W) where N is number of frames
+        # returns: (N, 1024) features
+        return self.model.encode_image(x, normalize=self.normalize)
+
+
+def export_vision_encoder(model_id="facebook/sam-audio-small", output_dir="onnx_models", device="cpu"):
+    """Export the vision encoder to ONNX."""
+    from transformers import AutoConfig
+    from huggingface_hub import hf_hub_download
+
+    print(f"Loading Vision Encoder from {model_id}...")
+
+    print("Fetching config...")
+    cfg_hf = AutoConfig.from_pretrained(model_id)
+    cfg_dict = cfg_hf.to_dict()
+
+    # Extract vision encoder config
+    v_cfg_dict = cfg_dict.get("vision_encoder", {})
+    v_cfg = PerceptionEncoderConfig(**v_cfg_dict)
+
+    print(f"Initializing PerceptionEncoder with name: {v_cfg.name}...")
+    vision_encoder = PerceptionEncoder(v_cfg)
+
+    # Load weights from checkpoint
+    print("Loading weights from SAM Audio checkpoint...")
+    checkpoint_path = hf_hub_download(repo_id=model_id, filename="checkpoint.pt")
+    state_dict = torch.load(checkpoint_path, map_location="cpu", mmap=True)
+
+    # Filter vision encoder weights
+    vision_state = {}
+    prefix = "vision_encoder."
+    for key, value in state_dict.items():
+        if key.startswith(prefix):
+            new_key = key[len(prefix):]
+            vision_state[new_key] = value
+
+    if vision_state:
+        print(f"  Loading {len(vision_state)} tensors into vision encoder...")
+        vision_encoder.load_state_dict(vision_state)
+        print("  ✓ Vision encoder weights loaded.")
+    else:
+        print("  WARNING: No 'vision_encoder' weights found in checkpoint. Using base weights.")
+
+    image_size = vision_encoder.image_size
+    print(f"  Image size: {image_size}")
+
+    wrapper = VisionEncoderWrapper(vision_encoder).eval().to(device)
+
+    # Create dummy input on device
+    dummy_input = torch.randn(1, 3, image_size, image_size, device=device)
+
+    output_path = os.path.join(output_dir, "vision_encoder.onnx")
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"Exporting to {output_path}...")
+    input_names = ["video_frames"]
+    output_names = ["vision_features"]
+    opset_version = 18  # Use opset 18 for better CUDA compatibility
+    torch.onnx.export(
+        wrapper,
+        dummy_input,
+        output_path,
+        input_names=input_names,
+        output_names=output_names,
+        dynamic_axes={
+            "video_frames": {0: "num_frames"},
+            "vision_features": {0: "num_frames"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,
+    )
+
+    # Check if data was saved separately
+    data_path = output_path + ".data"
+    if os.path.exists(data_path):
+        print(f"  Large model detected, weights saved to {data_path}")
+
+    print("✓ Vision encoder export complete!")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, default="facebook/sam-audio-small")
+    parser.add_argument("--output", type=str, default="onnx_models")
+    parser.add_argument("--device", type=str, default="cpu", help="Device for export (cpu or cuda)")
+    args = parser.parse_args()
+
+    export_vision_encoder(args.model, args.output, device=args.device)
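
A small sketch of calling the exported vision encoder (assumed path; real frames would need the same resize and normalization the PerceptionEncoder preprocessing applies, which is not shown here):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("onnx_models/vision_encoder.onnx", providers=["CPUExecutionProvider"])
image_size = sess.get_inputs()[0].shape[-1]  # H/W are static in this export; only the frame axis is dynamic
frames = np.random.rand(4, 3, image_size, image_size).astype(np.float32)  # 4 placeholder frames
(features,) = sess.run(["vision_features"], {"video_frames": frames})
print(features.shape)  # (num_frames, feature_dim)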
onnx_export/quantize_large_model.py ADDED
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Memory-efficient FP16 conversion for large ONNX models with external data.
+
+The model graph is loaded once together with its external weights, all FP32
+initializers are converted to FP16, and the result is written back out with
+the weights in a single external data file.
+
+Usage:
+    python -m onnx_export.quantize_large_model \
+        --input onnx_models_large/dit_single_step.onnx \
+        --output onnx_models_large_fp16/dit_single_step.onnx
+"""
+
+import os
+import argparse
+import numpy as np
+from pathlib import Path
+
+
+def convert_tensor_to_fp16(tensor_data: np.ndarray) -> np.ndarray:
+    """Convert tensor data to FP16 if it's FP32."""
+    if tensor_data.dtype == np.float32:
+        return tensor_data.astype(np.float16)
+    return tensor_data
+
+
+def quantize_large_model_fp16(input_path: str, output_path: str):
+    """
+    Convert a large ONNX model to FP16 using onnxruntime.transformers.
+
+    This properly updates both tensor data AND graph type annotations.
+    """
+    import onnx
+    from onnxruntime.transformers import float16
+    import gc
+
+    input_dir = os.path.dirname(os.path.abspath(input_path))
+    output_dir = os.path.dirname(os.path.abspath(output_path))
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"Loading model from {input_path}...")
+    print("  (This may take a while for large models)")
+
+    # Load model with external data
+    model = onnx.load(input_path, load_external_data=False)
+    onnx.load_external_data_for_model(model, input_dir)
+
+    original_size = sum(
+        np.prod(tensor.dims) * 4  # Assuming FP32
+        for tensor in model.graph.initializer
+        if tensor.data_type == onnx.TensorProto.FLOAT
+    )
+
+    print(f"  Loaded model ({original_size / 1e9:.2f} GB of FP32 weights)")
+
+    print("Converting to FP16...")
+    model_fp16 = float16.convert_float_to_float16(
+        model,
+        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
+        disable_shape_infer=True,  # Skip shape inference for speed
+    )
+
+    # Free original model
+    del model
+    gc.collect()
+
+    # External data file for output
+    output_data_filename = os.path.basename(output_path) + ".data"
+
+    print(f"Saving to {output_path}...")
+    onnx.save(
+        model_fp16,
+        output_path,
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location=output_data_filename,
+        size_threshold=0,  # Save all tensors externally
+    )
+
+    # Report results
+    output_data_path = os.path.join(output_dir, output_data_filename)
+    if os.path.exists(output_path) and os.path.exists(output_data_path):
+        output_size = os.path.getsize(output_data_path)
+        print("✓ Model saved successfully!")
+        print(f"  Graph: {os.path.getsize(output_path) / 1e6:.2f} MB")
+        print(f"  Weights: {output_size / 1e9:.2f} GB")
+        print(f"  Reduction: {(1 - output_size / original_size) * 100:.1f}%")
+    else:
+        raise RuntimeError("Output files were not created properly")
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Memory-efficient FP16 conversion for large ONNX models")
+    parser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Input ONNX model path",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Output ONNX model path",
+    )
+
+    args = parser.parse_args()
+
+    quantize_large_model_fp16(args.input, args.output)
+
+
+if __name__ == "__main__":
+    main()
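
Because keep_io_types=True is used, the converted graph computes in FP16 internally but still exposes FP32 inputs and outputs, so existing callers do not need to change dtypes. A quick sanity check along those lines (output path assumed from the usage example above):

import onnxruntime as ort

sess = ort.InferenceSession(
    "onnx_models_large_fp16/dit_single_step.onnx",
    providers=["CPUExecutionProvider"],
)
for inp in sess.get_inputs():
    print(inp.name, inp.type)  # expected: tensor(float), not tensor(float16)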
onnx_export/quantize_models.py ADDED
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+"""
+Quantize ONNX models for SAM Audio to reduce size and improve inference speed.
+
+Supports:
+- FP16 quantization (recommended for audio models)
+- INT8 dynamic quantization (best size reduction)
+- INT8 static quantization (requires calibration data)
+
+Usage:
+    # Quantize all models to FP16
+    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16
+
+    # Quantize to INT8 (dynamic)
+    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_int8 --mode int8
+
+    # Quantize a specific model
+    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16 --models dit
+"""
+
+import os
+import argparse
+import shutil
+from pathlib import Path
+
+
+def get_model_files(model_dir: str) -> dict:
+    """Find all ONNX model files in a directory."""
+    models = {}
+    model_names = {
+        "dit_single_step": "DiT Denoiser",
+        "dacvae_encoder": "DACVAE Encoder",
+        "dacvae_decoder": "DACVAE Decoder",
+        "t5_encoder": "T5 Text Encoder",
+        "vision_encoder": "Vision Encoder",
+    }
+
+    for name, display_name in model_names.items():
+        onnx_path = os.path.join(model_dir, f"{name}.onnx")
+        if os.path.exists(onnx_path):
+            models[name] = {
+                "path": onnx_path,
+                "display_name": display_name,
+                "has_external_data": os.path.exists(f"{onnx_path}.data"),
+            }
+
+    return models
+
+
+def quantize_fp16(input_path: str, output_path: str, has_external_data: bool = False):
+    """Convert a model to FP16 precision."""
+    import onnx
+    from onnxruntime.transformers import float16
+
+    print("  Loading model...")
+
+    # For models with external data, load everything into memory
+    if has_external_data:
+        model_dir = os.path.dirname(os.path.abspath(input_path))
+        model = onnx.load(input_path, load_external_data=False)
+        onnx.load_external_data_for_model(model, model_dir)
+    else:
+        model = onnx.load(input_path)
+
+    print("  Converting to FP16...")
+    model_fp16 = float16.convert_float_to_float16(
+        model,
+        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
+        disable_shape_infer=True,  # Skip shape inference (faster)
+    )
+
+    # Free original model memory
+    del model
+    import gc
+    gc.collect()
+
+    # Estimate the size of the FP16 model by serializing it; only fall back to
+    # external data if it exceeds the 2GB protobuf limit.
+    print(f"  Saving to {output_path}...")
+
+    try:
+        # Serialize to check size
+        model_bytes = model_fp16.SerializeToString()
+        model_size = len(model_bytes)
+
+        if model_size < 2 * 1024 * 1024 * 1024:  # Under 2GB
+            # Save as a self-contained file (no external data)
+            with open(output_path, 'wb') as f:
+                f.write(model_bytes)
+            print(f"  Saved as self-contained ONNX ({model_size / 1e6:.1f} MB)")
+        else:
+            # Too large, need external data
+            onnx.save(
+                model_fp16,
+                output_path,
+                save_as_external_data=True,
+                all_tensors_to_one_file=True,
+                location=os.path.basename(output_path) + ".data",
+                size_threshold=0,
+            )
+            print(f"  Saved with external data ({model_size / 1e9:.2f} GB)")
+    except Exception:
+        # If serialization fails (model too large for a single protobuf), use external data
+        print("  Model too large for in-memory serialization, saving with external data...")
+        onnx.save(
+            model_fp16,
+            output_path,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=os.path.basename(output_path) + ".data",
+            size_threshold=0,
+        )
+
+    return True
+
+
+def quantize_int8_dynamic(input_path: str, output_path: str, has_external_data: bool = False):
+    """Quantize a model to INT8 using dynamic quantization."""
+    from onnxruntime.quantization import quantize_dynamic, QuantType
+    import onnx
+
+    print("  Loading model...")
+
+    # For models with external data, we need to load and re-save as a single file first
+    temp_path = None
+    if has_external_data:
+        model = onnx.load(input_path, load_external_data=True)
+        temp_path = input_path + ".temp.onnx"
+        onnx.save(model, temp_path)
+        input_path = temp_path
+
+    print("  Quantizing to INT8 (dynamic)...")
+
+    quantize_dynamic(
+        input_path,
+        output_path,
+        weight_type=QuantType.QInt8,
+        extra_options={
+            "EnableSubgraph": True,
+        }
+    )
+
+    # Clean up the temporary self-contained model
+    if temp_path is not None and os.path.exists(temp_path):
+        os.remove(temp_path)
+
+    return True
+
+
+def quantize_model(
+    name: str,
+    model_info: dict,
+    output_dir: str,
+    mode: str,
+) -> bool:
+    """Quantize a single model."""
+    input_path = model_info["path"]
+    output_path = os.path.join(output_dir, f"{name}.onnx")
+    has_external_data = model_info["has_external_data"]
+
+    print(f"\nQuantizing {model_info['display_name']}...")
+    print(f"  Input: {input_path}")
+    print(f"  Output: {output_path}")
+    print(f"  External data: {has_external_data}")
+
+    try:
+        if mode == "fp16":
+            success = quantize_fp16(input_path, output_path, has_external_data)
+        elif mode == "int8":
+            success = quantize_int8_dynamic(input_path, output_path, has_external_data)
+        else:
+            print(f"  ✗ Unknown quantization mode: {mode}")
+            return False
+
+        if success:
+            # Report size reduction
+            input_size = os.path.getsize(input_path)
+            if has_external_data:
+                input_size += os.path.getsize(input_path + ".data")
+
+            output_size = os.path.getsize(output_path)
+            if os.path.exists(output_path + ".data"):
+                output_size += os.path.getsize(output_path + ".data")
+
+            reduction = (1 - output_size / input_size) * 100
+            print(f"  ✓ Done! Size: {input_size / 1e9:.2f}GB → {output_size / 1e9:.2f}GB ({reduction:.1f}% reduction)")
+            return True
+
+    except Exception as e:
+        print(f"  ✗ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+    return False
+
+
+def copy_tokenizer(model_dir: str, output_dir: str):
+    """Copy tokenizer files to the output directory."""
+    tokenizer_dir = os.path.join(model_dir, "tokenizer")
+    tokenizer_config = os.path.join(model_dir, "tokenizer_config.json")
+
+    if os.path.exists(tokenizer_dir):
+        output_tokenizer_dir = os.path.join(output_dir, "tokenizer")
+        if not os.path.exists(output_tokenizer_dir):
+            shutil.copytree(tokenizer_dir, output_tokenizer_dir)
+            print("\n✓ Copied tokenizer directory")
+
+    if os.path.exists(tokenizer_config):
+        shutil.copy(tokenizer_config, os.path.join(output_dir, "tokenizer_config.json"))
+        print("✓ Copied tokenizer_config.json")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Quantize ONNX models for SAM Audio")
+    parser.add_argument(
+        "--model-dir",
+        type=str,
+        default="onnx_models",
+        help="Directory containing ONNX models",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Output directory for quantized models",
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["fp16", "int8"],
+        default="fp16",
+        help="Quantization mode: fp16 (recommended) or int8",
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        nargs="+",
+        choices=["dit", "dacvae_encoder", "dacvae_decoder", "t5", "vision", "all"],
+        default=["all"],
+        help="Which models to quantize",
+    )
+
+    args = parser.parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Find models
+    models = get_model_files(args.model_dir)
+
+    if not models:
+        print(f"No ONNX models found in {args.model_dir}")
+        return
+
+    print(f"Found {len(models)} models in {args.model_dir}")
+    print(f"Quantization mode: {args.mode.upper()}")
+
+    # Filter models if specific ones were requested
+    if "all" not in args.models:
+        name_mapping = {
+            "dit": "dit_single_step",
+            "dacvae_encoder": "dacvae_encoder",
+            "dacvae_decoder": "dacvae_decoder",
+            "t5": "t5_encoder",
+            "vision": "vision_encoder",
+        }
+        selected = {name_mapping[m] for m in args.models if m in name_mapping}
+        models = {k: v for k, v in models.items() if k in selected}
+
+    # Quantize each model
+    success_count = 0
+    for name, model_info in models.items():
+        if quantize_model(name, model_info, args.output_dir, args.mode):
+            success_count += 1
+
+    # Copy tokenizer files
+    copy_tokenizer(args.model_dir, args.output_dir)
+
+    print(f"\n{'='*50}")
+    print(f"✓ Quantization complete! {success_count}/{len(models)} models processed")
+    print(f"  Output directory: {args.output_dir}")
+
+
+if __name__ == "__main__":
+    main()
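
After quantization, it can be worth spot-checking how far the FP16 outputs drift from the FP32 originals. A hedged sketch for the T5 encoder (directories assumed from the usage examples above; random token IDs stand in for real prompts):

import numpy as np
import onnxruntime as ort

feeds = {
    "input_ids": np.random.randint(0, 1000, (1, 77), dtype=np.int64),
    "attention_mask": np.ones((1, 77), dtype=np.int64),
}
ref = ort.InferenceSession("onnx_models/t5_encoder.onnx", providers=["CPUExecutionProvider"]).run(None, feeds)[0]
fp16 = ort.InferenceSession("onnx_models_fp16/t5_encoder.onnx", providers=["CPUExecutionProvider"]).run(None, feeds)[0]
print("max abs diff:", np.abs(ref - fp16).max())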