matbee committed · verified
Commit 2d75c95 · 1 Parent(s): 07823f7

Delete files onnx_export/quantize_models.py onnx_export/quantize_large_model.py with huggingface_hub

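The commit message indicates the two scripts were removed via the huggingface_hub client. For reference, here is a minimal sketch of how such a deletion can be issued programmatically with the hub API; the repo_id is a placeholder, not taken from this commit:

# Minimal sketch: delete files in a single commit with huggingface_hub.
# repo_id is a hypothetical placeholder; the paths and message mirror this commit.
from huggingface_hub import HfApi, CommitOperationDelete

api = HfApi()
api.create_commit(
    repo_id="user/repo",  # placeholder
    operations=[
        CommitOperationDelete(path_in_repo="onnx_export/quantize_models.py"),
        CommitOperationDelete(path_in_repo="onnx_export/quantize_large_model.py"),
    ],
    commit_message="Delete files onnx_export/quantize_models.py onnx_export/quantize_large_model.py with huggingface_hub",
)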
onnx_export/quantize_large_model.py DELETED
@@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-"""
-Memory-efficient FP16 conversion for large ONNX models with external data.
-
-This script converts models by processing tensors one at a time, avoiding
-loading the entire model into memory.
-
-Usage:
-    python -m onnx_export.quantize_large_model \
-        --input onnx_models_large/dit_single_step.onnx \
-        --output onnx_models_large_fp16/dit_single_step.onnx
-"""
-
-import os
-import argparse
-import numpy as np
-from pathlib import Path
-
-
-def convert_tensor_to_fp16(tensor_data: np.ndarray) -> np.ndarray:
-    """Convert tensor data to FP16 if it's FP32."""
-    if tensor_data.dtype == np.float32:
-        return tensor_data.astype(np.float16)
-    return tensor_data
-
-
-def quantize_large_model_fp16(input_path: str, output_path: str):
-    """
-    Convert large ONNX model to FP16 using onnxruntime.transformers.
-
-    This properly updates both tensor data AND graph type annotations.
-    """
-    import onnx
-    from onnxruntime.transformers import float16
-    import gc
-
-    input_dir = os.path.dirname(os.path.abspath(input_path))
-    output_dir = os.path.dirname(os.path.abspath(output_path))
-    os.makedirs(output_dir, exist_ok=True)
-
-    print(f"Loading model from {input_path}...")
-    print(f" (This may take a while for large models)")
-
-    # Load model with external data
-    model = onnx.load(input_path, load_external_data=False)
-    onnx.load_external_data_for_model(model, input_dir)
-
-    original_size = sum(
-        np.prod(tensor.dims) * 4  # Assuming FP32
-        for tensor in model.graph.initializer
-        if tensor.data_type == onnx.TensorProto.FLOAT
-    )
-
-    print(f" Loaded model ({original_size / 1e9:.2f} GB of FP32 weights)")
-
-    print(f"Converting to FP16...")
-    model_fp16 = float16.convert_float_to_float16(
-        model,
-        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
-        disable_shape_infer=True,  # Skip shape inference for speed
-    )
-
-    # Free original model
-    del model
-    gc.collect()
-
-    # External data file for output
-    output_data_filename = os.path.basename(output_path) + ".data"
-
-    print(f"Saving to {output_path}...")
-    onnx.save(
-        model_fp16,
-        output_path,
-        save_as_external_data=True,
-        all_tensors_to_one_file=True,
-        location=output_data_filename,
-        size_threshold=0,  # Save all tensors externally
-    )
-
-    # Report results
-    output_data_path = os.path.join(output_dir, output_data_filename)
-    if os.path.exists(output_path) and os.path.exists(output_data_path):
-        output_size = os.path.getsize(output_data_path)
-        print(f"✓ Model saved successfully!")
-        print(f" Graph: {os.path.getsize(output_path)/1e6:.2f} MB")
-        print(f" Weights: {output_size/1e9:.2f} GB")
-        print(f" Reduction: {(1 - output_size / original_size) * 100:.1f}%")
-    else:
-        raise RuntimeError("Output files were not created properly")
-
-    return True
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Memory-efficient FP16 conversion for large ONNX models")
-    parser.add_argument(
-        "--input",
-        type=str,
-        required=True,
-        help="Input ONNX model path",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        required=True,
-        help="Output ONNX model path",
-    )
-
-    args = parser.parse_args()
-
-    quantize_large_model_fp16(args.input, args.output)
-
-
-if __name__ == "__main__":
-    main()

onnx_export/quantize_models.py DELETED
@@ -1,286 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quantize ONNX models for SAM Audio to reduce size and improve inference speed.
-
-Supports:
-- FP16 quantization (recommended for audio models)
-- INT8 dynamic quantization (best size reduction)
-- INT8 static quantization (requires calibration data)
-
-Usage:
-    # Quantize all models to FP16
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16
-
-    # Quantize to INT8 (dynamic)
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_int8 --mode int8
-
-    # Quantize specific model
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16 --models dit
-"""
-
-import os
-import argparse
-import shutil
-from pathlib import Path
-
-
-def get_model_files(model_dir: str) -> dict:
-    """Find all ONNX model files in directory."""
-    models = {}
-    model_names = {
-        "dit_single_step": "DiT Denoiser",
-        "dacvae_encoder": "DACVAE Encoder",
-        "dacvae_decoder": "DACVAE Decoder",
-        "t5_encoder": "T5 Text Encoder",
-        "vision_encoder": "Vision Encoder",
-    }
-
-    for name, display_name in model_names.items():
-        onnx_path = os.path.join(model_dir, f"{name}.onnx")
-        if os.path.exists(onnx_path):
-            models[name] = {
-                "path": onnx_path,
-                "display_name": display_name,
-                "has_external_data": os.path.exists(f"{onnx_path}.data"),
-            }
-
-    return models
-
-
-def quantize_fp16(input_path: str, output_path: str, has_external_data: bool = False):
-    """Convert model to FP16 precision."""
-    import onnx
-    from onnxruntime.transformers import float16
-
-    print(f" Loading model...")
-
-    # For models with external data, load everything into memory
-    if has_external_data:
-        model_dir = os.path.dirname(os.path.abspath(input_path))
-        model = onnx.load(input_path, load_external_data=False)
-        onnx.load_external_data_for_model(model, model_dir)
-    else:
-        model = onnx.load(input_path)
-
-    print(f" Converting to FP16...")
-    model_fp16 = float16.convert_float_to_float16(
-        model,
-        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
-        disable_shape_infer=True,  # Skip shape inference (faster)
-    )
-
-    # Free original model memory
-    del model
-    import gc
-    gc.collect()
-
-    # Calculate the size of the FP16 model
-    # We estimate by serializing - only use external data if over 2GB
-    print(f" Saving to {output_path}...")
-
-    # First try to save without external data (preferred for smaller models)
-    try:
-        # Serialize to check size
-        model_bytes = model_fp16.SerializeToString()
-        model_size = len(model_bytes)
-
-        if model_size < 2 * 1024 * 1024 * 1024:  # Under 2GB
-            # Save as self-contained file (no external data)
-            with open(output_path, 'wb') as f:
-                f.write(model_bytes)
-            print(f" Saved as self-contained ONNX ({model_size/1e6:.1f} MB)")
-        else:
-            # Too large, need external data
-            onnx.save(
-                model_fp16,
-                output_path,
-                save_as_external_data=True,
-                all_tensors_to_one_file=True,
-                location=os.path.basename(output_path) + ".data",
-                size_threshold=0,
-            )
-            print(f" Saved with external data ({model_size/1e9:.2f} GB)")
-    except Exception as e:
-        # If serialization fails (too large), use external data
-        print(f" Model too large for memory, saving with external data...")
-        onnx.save(
-            model_fp16,
-            output_path,
-            save_as_external_data=True,
-            all_tensors_to_one_file=True,
-            location=os.path.basename(output_path) + ".data",
-            size_threshold=0,
-        )
-
-    return True
-
-
-def quantize_int8_dynamic(input_path: str, output_path: str, has_external_data: bool = False):
-    """Quantize model to INT8 using dynamic quantization."""
-    from onnxruntime.quantization import quantize_dynamic, QuantType
-    import onnx
-
-    print(f" Loading model...")
-
-    # For models with external data, we need to load and re-save first
-    if has_external_data:
-        model = onnx.load(input_path, load_external_data=True)
-        temp_path = input_path + ".temp.onnx"
-        onnx.save(model, temp_path)
-        input_path = temp_path
-
-    print(f" Quantizing to INT8 (dynamic)...")
-
-    quantize_dynamic(
-        input_path,
-        output_path,
-        weight_type=QuantType.QInt8,
-        extra_options={
-            "EnableSubgraph": True,
-        }
-    )
-
-    # Cleanup temp file (input_path was reassigned to the temp path above)
-    if has_external_data and os.path.exists(input_path):
-        os.remove(input_path)
-
-    return True
-
-
-def quantize_model(
-    name: str,
-    model_info: dict,
-    output_dir: str,
-    mode: str,
-) -> bool:
-    """Quantize a single model."""
-    input_path = model_info["path"]
-    output_path = os.path.join(output_dir, f"{name}.onnx")
-    has_external_data = model_info["has_external_data"]
-
-    print(f"\nQuantizing {model_info['display_name']}...")
-    print(f" Input: {input_path}")
-    print(f" Output: {output_path}")
-    print(f" External data: {has_external_data}")
-
-    try:
-        if mode == "fp16":
-            success = quantize_fp16(input_path, output_path, has_external_data)
-        elif mode == "int8":
-            success = quantize_int8_dynamic(input_path, output_path, has_external_data)
-        else:
-            print(f" ✗ Unknown quantization mode: {mode}")
-            return False
-
-        if success:
-            # Report size reduction
-            input_size = os.path.getsize(input_path)
-            if has_external_data:
-                input_size += os.path.getsize(input_path + ".data")
-
-            output_size = os.path.getsize(output_path)
-            if os.path.exists(output_path + ".data"):
-                output_size += os.path.getsize(output_path + ".data")
-
-            reduction = (1 - output_size / input_size) * 100
-            print(f" ✓ Done! Size: {input_size/1e9:.2f}GB → {output_size/1e9:.2f}GB ({reduction:.1f}% reduction)")
-            return True
-
-    except Exception as e:
-        print(f" ✗ Error: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-    return False
-
-
-def copy_tokenizer(model_dir: str, output_dir: str):
-    """Copy tokenizer files to output directory."""
-    tokenizer_dir = os.path.join(model_dir, "tokenizer")
-    tokenizer_config = os.path.join(model_dir, "tokenizer_config.json")
-
-    if os.path.exists(tokenizer_dir):
-        output_tokenizer_dir = os.path.join(output_dir, "tokenizer")
-        if not os.path.exists(output_tokenizer_dir):
-            shutil.copytree(tokenizer_dir, output_tokenizer_dir)
-            print(f"\n✓ Copied tokenizer directory")
-
-    if os.path.exists(tokenizer_config):
-        shutil.copy(tokenizer_config, os.path.join(output_dir, "tokenizer_config.json"))
-        print(f"✓ Copied tokenizer_config.json")
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Quantize ONNX models for SAM Audio")
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default="onnx_models",
-        help="Directory containing ONNX models",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        required=True,
-        help="Output directory for quantized models",
-    )
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["fp16", "int8"],
-        default="fp16",
-        help="Quantization mode: fp16 (recommended) or int8",
-    )
-    parser.add_argument(
-        "--models",
-        type=str,
-        nargs="+",
-        choices=["dit", "dacvae_encoder", "dacvae_decoder", "t5", "vision", "all"],
-        default=["all"],
-        help="Which models to quantize",
-    )
-
-    args = parser.parse_args()
-
-    # Create output directory
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    # Find models
-    models = get_model_files(args.model_dir)
-
-    if not models:
-        print(f"No ONNX models found in {args.model_dir}")
-        return
-
-    print(f"Found {len(models)} models in {args.model_dir}")
-    print(f"Quantization mode: {args.mode.upper()}")
-
-    # Filter models if specific ones requested
-    if "all" not in args.models:
-        name_mapping = {
-            "dit": "dit_single_step",
-            "dacvae_encoder": "dacvae_encoder",
-            "dacvae_decoder": "dacvae_decoder",
-            "t5": "t5_encoder",
-            "vision": "vision_encoder",
-        }
-        selected = {name_mapping[m] for m in args.models if m in name_mapping}
-        models = {k: v for k, v in models.items() if k in selected}
-
-    # Quantize each model
-    success_count = 0
-    for name, model_info in models.items():
-        if quantize_model(name, model_info, args.output_dir, args.mode):
-            success_count += 1
-
-    # Copy tokenizer files
-    copy_tokenizer(args.model_dir, args.output_dir)
-
-    print(f"\n{'='*50}")
-    print(f"✓ Quantization complete! {success_count}/{len(models)} models processed")
-    print(f" Output directory: {args.output_dir}")
-
-
-if __name__ == "__main__":
-    main()
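
With both quantization helpers removed, a quick sanity check for a previously converted FP16 model is simply to open it with onnxruntime and confirm the graph loads and the inputs/outputs stayed FP32 (the deleted scripts converted with keep_io_types=True). A minimal sketch, reusing the output path from the old usage example as a placeholder:

# Minimal sketch: load an FP16-converted model and inspect its I/O types.
# The path comes from the deleted script's usage example; adjust as needed.
import onnxruntime as ort

sess = ort.InferenceSession(
    "onnx_models_large_fp16/dit_single_step.onnx",  # placeholder path
    providers=["CPUExecutionProvider"],
)
for inp in sess.get_inputs():
    print(inp.name, inp.type, inp.shape)  # expect 'tensor(float)' with keep_io_types=True
for out in sess.get_outputs():
    print(out.name, out.type, out.shape)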