matbee committed · verified
Commit 2d75c95 · 1 Parent(s): 07823f7

Delete files onnx_export/quantize_models.py onnx_export/quantize_large_model.py with huggingface_hub

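The commit message indicates the two scripts were removed via the huggingface_hub client. For reference, here is a minimal sketch of how such a deletion can be issued programmatically with the hub API; the repo_id is a placeholder, not taken from this commit:

# Minimal sketch: delete files in a single commit with huggingface_hub.
# repo_id is a hypothetical placeholder; the paths and message mirror this commit.
from huggingface_hub import HfApi, CommitOperationDelete

api = HfApi()
api.create_commit(
    repo_id="user/repo",  # placeholder
    operations=[
        CommitOperationDelete(path_in_repo="onnx_export/quantize_models.py"),
        CommitOperationDelete(path_in_repo="onnx_export/quantize_large_model.py"),
    ],
    commit_message="Delete files onnx_export/quantize_models.py onnx_export/quantize_large_model.py with huggingface_hub",
)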
onnx_export/quantize_large_model.py DELETED
@@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-"""
-Memory-efficient FP16 conversion for large ONNX models with external data.
-
-This script converts models by processing tensors one at a time, avoiding
-loading the entire model into memory.
-
-Usage:
-    python -m onnx_export.quantize_large_model \
-        --input onnx_models_large/dit_single_step.onnx \
-        --output onnx_models_large_fp16/dit_single_step.onnx
-"""
-
-import os
-import argparse
-import numpy as np
-from pathlib import Path
-
-
-def convert_tensor_to_fp16(tensor_data: np.ndarray) -> np.ndarray:
-    """Convert tensor data to FP16 if it's FP32."""
-    if tensor_data.dtype == np.float32:
-        return tensor_data.astype(np.float16)
-    return tensor_data
-
-
-def quantize_large_model_fp16(input_path: str, output_path: str):
-    """
-    Convert large ONNX model to FP16 using onnxruntime.transformers.
-
-    This properly updates both tensor data AND graph type annotations.
-    """
-    import onnx
-    from onnxruntime.transformers import float16
-    import gc
-
-    input_dir = os.path.dirname(os.path.abspath(input_path))
-    output_dir = os.path.dirname(os.path.abspath(output_path))
-    os.makedirs(output_dir, exist_ok=True)
-
-    print(f"Loading model from {input_path}...")
-    print(f" (This may take a while for large models)")
-
-    # Load model with external data
-    model = onnx.load(input_path, load_external_data=False)
-    onnx.load_external_data_for_model(model, input_dir)
-
-    original_size = sum(
-        np.prod(tensor.dims) * 4  # Assuming FP32
-        for tensor in model.graph.initializer
-        if tensor.data_type == onnx.TensorProto.FLOAT
-    )
-
-    print(f" Loaded model ({original_size / 1e9:.2f} GB of FP32 weights)")
-
-    print(f"Converting to FP16...")
-    model_fp16 = float16.convert_float_to_float16(
-        model,
-        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
-        disable_shape_infer=True,  # Skip shape inference for speed
-    )
-
-    # Free original model
-    del model
-    gc.collect()
-
-    # External data file for output
-    output_data_filename = os.path.basename(output_path) + ".data"
-
-    print(f"Saving to {output_path}...")
-    onnx.save(
-        model_fp16,
-        output_path,
-        save_as_external_data=True,
-        all_tensors_to_one_file=True,
-        location=output_data_filename,
-        size_threshold=0,  # Save all tensors externally
-    )
-
-    # Report results
-    output_data_path = os.path.join(output_dir, output_data_filename)
-    if os.path.exists(output_path) and os.path.exists(output_data_path):
-        output_size = os.path.getsize(output_data_path)
-        print(f"✓ Model saved successfully!")
-        print(f" Graph: {os.path.getsize(output_path)/1e6:.2f} MB")
-        print(f" Weights: {output_size/1e9:.2f} GB")
-        print(f" Reduction: {(1 - output_size / original_size) * 100:.1f}%")
-    else:
-        raise RuntimeError("Output files were not created properly")
-
-    return True
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Memory-efficient FP16 conversion for large ONNX models")
-    parser.add_argument(
-        "--input",
-        type=str,
-        required=True,
-        help="Input ONNX model path",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        required=True,
-        help="Output ONNX model path",
-    )
-
-    args = parser.parse_args()
-
-    quantize_large_model_fp16(args.input, args.output)
-
-
-if __name__ == "__main__":
-    main()

onnx_export/quantize_models.py DELETED
@@ -1,286 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quantize ONNX models for SAM Audio to reduce size and improve inference speed.
-
-Supports:
-- FP16 quantization (recommended for audio models)
-- INT8 dynamic quantization (best size reduction)
-- INT8 static quantization (requires calibration data)
-
-Usage:
-    # Quantize all models to FP16
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16
-
-    # Quantize to INT8 (dynamic)
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_int8 --mode int8
-
-    # Quantize specific model
-    python -m onnx_export.quantize_models --model-dir onnx_models --output-dir onnx_models_fp16 --mode fp16 --models dit
-"""
-
-import os
-import argparse
-import shutil
-from pathlib import Path
-
-
-def get_model_files(model_dir: str) -> dict:
-    """Find all ONNX model files in directory."""
-    models = {}
-    model_names = {
-        "dit_single_step": "DiT Denoiser",
-        "dacvae_encoder": "DACVAE Encoder",
-        "dacvae_decoder": "DACVAE Decoder",
-        "t5_encoder": "T5 Text Encoder",
-        "vision_encoder": "Vision Encoder",
-    }
-
-    for name, display_name in model_names.items():
-        onnx_path = os.path.join(model_dir, f"{name}.onnx")
-        if os.path.exists(onnx_path):
-            models[name] = {
-                "path": onnx_path,
-                "display_name": display_name,
-                "has_external_data": os.path.exists(f"{onnx_path}.data"),
-            }
-
-    return models
-
-
-def quantize_fp16(input_path: str, output_path: str, has_external_data: bool = False):
-    """Convert model to FP16 precision."""
-    import onnx
-    from onnxruntime.transformers import float16
-
-    print(f" Loading model...")
-
-    # For models with external data, load everything into memory
-    if has_external_data:
-        model_dir = os.path.dirname(os.path.abspath(input_path))
-        model = onnx.load(input_path, load_external_data=False)
-        onnx.load_external_data_for_model(model, model_dir)
-    else:
-        model = onnx.load(input_path)
-
-    print(f" Converting to FP16...")
-    model_fp16 = float16.convert_float_to_float16(
-        model,
-        keep_io_types=True,  # Keep inputs/outputs as FP32 for compatibility
-        disable_shape_infer=True,  # Skip shape inference (faster)
-    )
-
-    # Free original model memory
-    del model
-    import gc
-    gc.collect()
-
-    # Calculate the size of the FP16 model
-    # We estimate by serializing - only use external data if over 2GB
-    print(f" Saving to {output_path}...")
-
-    # First try to save without external data (preferred for smaller models)
-    try:
-        # Serialize to check size
-        model_bytes = model_fp16.SerializeToString()
-        model_size = len(model_bytes)
-
-        if model_size < 2 * 1024 * 1024 * 1024:  # Under 2GB
-            # Save as self-contained file (no external data)
-            with open(output_path, 'wb') as f:
-                f.write(model_bytes)
-            print(f" Saved as self-contained ONNX ({model_size/1e6:.1f} MB)")
-        else:
-            # Too large, need external data
-            onnx.save(
-                model_fp16,
-                output_path,
-                save_as_external_data=True,
-                all_tensors_to_one_file=True,
-                location=os.path.basename(output_path) + ".data",
-                size_threshold=0,
-            )
-            print(f" Saved with external data ({model_size/1e9:.2f} GB)")
-    except Exception as e:
-        # If serialization fails (too large), use external data
-        print(f" Model too large for memory, saving with external data...")
-        onnx.save(
-            model_fp16,
-            output_path,
-            save_as_external_data=True,
-            all_tensors_to_one_file=True,
-            location=os.path.basename(output_path) + ".data",
-            size_threshold=0,
-        )
-
-    return True
-
-
-def quantize_int8_dynamic(input_path: str, output_path: str, has_external_data: bool = False):
-    """Quantize model to INT8 using dynamic quantization."""
-    from onnxruntime.quantization import quantize_dynamic, QuantType
-    import onnx
-
-    print(f" Loading model...")
-
-    # For models with external data, we need to load and re-save first
-    if has_external_data:
-        model = onnx.load(input_path, load_external_data=True)
-        temp_path = input_path + ".temp.onnx"
-        onnx.save(model, temp_path)
-        input_path = temp_path
-
-    print(f" Quantizing to INT8 (dynamic)...")
-
-    quantize_dynamic(
-        input_path,
-        output_path,
-        weight_type=QuantType.QInt8,
-        extra_options={
-            "EnableSubgraph": True,
-        }
-    )
-
-    # Cleanup temp file (input_path was reassigned to the temp path above)
-    if has_external_data and os.path.exists(input_path):
-        os.remove(input_path)
-
-    return True
-
-
-def quantize_model(
-    name: str,
-    model_info: dict,
-    output_dir: str,
-    mode: str,
-) -> bool:
-    """Quantize a single model."""
-    input_path = model_info["path"]
-    output_path = os.path.join(output_dir, f"{name}.onnx")
-    has_external_data = model_info["has_external_data"]
-
-    print(f"\nQuantizing {model_info['display_name']}...")
-    print(f" Input: {input_path}")
-    print(f" Output: {output_path}")
-    print(f" External data: {has_external_data}")
-
-    try:
-        if mode == "fp16":
-            success = quantize_fp16(input_path, output_path, has_external_data)
-        elif mode == "int8":
-            success = quantize_int8_dynamic(input_path, output_path, has_external_data)
-        else:
-            print(f" ✗ Unknown quantization mode: {mode}")
-            return False
-
-        if success:
-            # Report size reduction
-            input_size = os.path.getsize(input_path)
-            if has_external_data:
-                input_size += os.path.getsize(input_path + ".data")
-
-            output_size = os.path.getsize(output_path)
-            if os.path.exists(output_path + ".data"):
-                output_size += os.path.getsize(output_path + ".data")
-
-            reduction = (1 - output_size / input_size) * 100
-            print(f" ✓ Done! Size: {input_size/1e9:.2f}GB → {output_size/1e9:.2f}GB ({reduction:.1f}% reduction)")
-            return True
-
-    except Exception as e:
-        print(f" ✗ Error: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-    return False
-
-
-def copy_tokenizer(model_dir: str, output_dir: str):
-    """Copy tokenizer files to output directory."""
-    tokenizer_dir = os.path.join(model_dir, "tokenizer")
-    tokenizer_config = os.path.join(model_dir, "tokenizer_config.json")
-
-    if os.path.exists(tokenizer_dir):
-        output_tokenizer_dir = os.path.join(output_dir, "tokenizer")
-        if not os.path.exists(output_tokenizer_dir):
-            shutil.copytree(tokenizer_dir, output_tokenizer_dir)
-            print(f"\n✓ Copied tokenizer directory")
-
-    if os.path.exists(tokenizer_config):
-        shutil.copy(tokenizer_config, os.path.join(output_dir, "tokenizer_config.json"))
-        print(f"✓ Copied tokenizer_config.json")
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Quantize ONNX models for SAM Audio")
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default="onnx_models",
-        help="Directory containing ONNX models",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        required=True,
-        help="Output directory for quantized models",
-    )
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["fp16", "int8"],
-        default="fp16",
-        help="Quantization mode: fp16 (recommended) or int8",
-    )
-    parser.add_argument(
-        "--models",
-        type=str,
-        nargs="+",
-        choices=["dit", "dacvae_encoder", "dacvae_decoder", "t5", "vision", "all"],
-        default=["all"],
-        help="Which models to quantize",
-    )
-
-    args = parser.parse_args()
-
-    # Create output directory
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    # Find models
-    models = get_model_files(args.model_dir)
-
-    if not models:
-        print(f"No ONNX models found in {args.model_dir}")
-        return
-
-    print(f"Found {len(models)} models in {args.model_dir}")
-    print(f"Quantization mode: {args.mode.upper()}")
-
-    # Filter models if specific ones requested
-    if "all" not in args.models:
-        name_mapping = {
-            "dit": "dit_single_step",
-            "dacvae_encoder": "dacvae_encoder",
-            "dacvae_decoder": "dacvae_decoder",
-            "t5": "t5_encoder",
-            "vision": "vision_encoder",
-        }
-        selected = {name_mapping[m] for m in args.models if m in name_mapping}
-        models = {k: v for k, v in models.items() if k in selected}
-
-    # Quantize each model
-    success_count = 0
-    for name, model_info in models.items():
-        if quantize_model(name, model_info, args.output_dir, args.mode):
-            success_count += 1
-
-    # Copy tokenizer files
-    copy_tokenizer(args.model_dir, args.output_dir)
-
-    print(f"\n{'='*50}")
-    print(f"✓ Quantization complete! {success_count}/{len(models)} models processed")
-    print(f" Output directory: {args.output_dir}")
-
-
-if __name__ == "__main__":
-    main()
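
With both quantization helpers removed, a quick sanity check for a previously converted FP16 model is simply to open it with onnxruntime and confirm the graph loads and the inputs/outputs stayed FP32 (the deleted scripts converted with keep_io_types=True). A minimal sketch, reusing the output path from the old usage example as a placeholder:

# Minimal sketch: load an FP16-converted model and inspect its I/O types.
# The path comes from the deleted script's usage example; adjust as needed.
import onnxruntime as ort

sess = ort.InferenceSession(
    "onnx_models_large_fp16/dit_single_step.onnx",  # placeholder path
    providers=["CPUExecutionProvider"],
)
for inp in sess.get_inputs():
    print(inp.name, inp.type, inp.shape)  # expect 'tensor(float)' with keep_io_types=True
for out in sess.get_outputs():
    print(out.name, out.type, out.shape)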