"""VRAM Calculator for HuggingFace Models""" from __future__ import annotations import gradio as gr from huggingface_hub import HfApi, hf_hub_download import json from functools import lru_cache api = HfApi() # Consumer GPUs: (VRAM GB, Memory Bandwidth GB/s) CONSUMER_GPUS = { "RTX 3080": (10, 760), "RTX 3080 Ti": (12, 912), "RTX 3090": (24, 936), "RTX 3090 Ti": (24, 1008), "RTX 4080": (16, 717), "RTX 4080 Super": (16, 736), "RTX 4090": (24, 1008), "RTX 5090": (32, 1792), } # Apple Silicon: (Unified Memory GB, Memory Bandwidth GB/s) APPLE_GPUS = { "M1 Max": (64, 400), "M2 Max": (96, 400), "M2 Ultra": (192, 800), "M3 Max": (128, 400), "M4 Max": (128, 546), } # Cloud/Datacenter GPUs: (VRAM GB, $/hr, Memory Bandwidth GB/s) CLOUD_GPUS = { "T4": (16, 0.35, 320), "L4": (24, 0.70, 300), "A10G": (24, 1.00, 600), "RTX A5000": (24, 0.80, 768), "RTX A6000": (48, 1.50, 768), "L40S": (48, 1.20, 864), "A100 40GB": (40, 3.00, 1555), "A100 80GB": (80, 5.00, 2039), "H100 80GB": (80, 8.00, 3350), "H100 NVL": (94, 10.00, 3938), } DTYPE_BYTES = { "F32": 4, "float32": 4, "F16": 2, "float16": 2, "BF16": 2, "bfloat16": 2, "I8": 1, "int8": 1, "U8": 1, "uint8": 1, } FRAMEWORKS = { "None (PyTorch)": 1.20, "vLLM": 1.10, "TGI": 1.15, "llama.cpp": 1.05, "Ollama": 1.08, } def bytes_to_gb(b): return b / (1024 ** 3) @lru_cache(maxsize=50) def fetch_model_info(model_id): try: return api.model_info(model_id, files_metadata=True) except Exception: return None @lru_cache(maxsize=50) def fetch_config(model_id): try: path = hf_hub_download(model_id, "config.json") with open(path) as f: return json.load(f) except Exception: return {} def get_params(info): if info and hasattr(info, 'safetensors') and info.safetensors: params = info.safetensors.total dtypes = info.safetensors.parameters if dtypes: dtype = max(dtypes, key=dtypes.get) return params, dtype return 0, "F16" def estimate_throughput(params, bandwidth_gbs, batch_size, dtype_bytes): """Estimate tokens/second based on memory bandwidth (rough approximation)""" # Simplified: tok/s ~ bandwidth / (params * dtype_bytes / batch_size) # This is a rough estimate; actual throughput depends on many factors model_gb = (params * dtype_bytes) / (1024**3) if model_gb == 0: return 0 # Rough heuristic: memory-bound inference tokens_per_sec = (bandwidth_gbs / model_gb) * batch_size * 0.5 # 50% efficiency factor return max(1, int(tokens_per_sec)) def calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank): """Main calculation function""" try: if not model_id or not model_id.strip(): return "Enter a model ID (e.g., meta-llama/Llama-3.1-8B)" model_id = model_id.strip() if "/" not in model_id: return "Model ID format: organization/model-name" info = fetch_model_info(model_id) if not info: return "Could not fetch model: " + model_id config = fetch_config(model_id) params, dtype = get_params(info) if params == 0: return "Could not read parameters (model may use .bin format)" dtype_bytes = DTYPE_BYTES.get(dtype, 2) params_b = params / 1e9 weights_gb = bytes_to_gb(params * dtype_bytes) layers = config.get("num_hidden_layers", config.get("n_layer", 32)) kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 32)) head_dim = config.get("head_dim", 128) hidden_size = config.get("hidden_size", 4096) num_heads = config.get("num_attention_heads", 32) vocab_size = config.get("vocab_size", 32000) intermediate_size = config.get("intermediate_size", hidden_size * 4) max_position = config.get("max_position_embeddings", 4096) if not head_dim: head_dim = hidden_size // num_heads if num_heads else 128 kv_bytes = 2 * layers * batch * context * kv_heads * head_dim * dtype_bytes kv_gb = bytes_to_gb(kv_bytes) out = [] out.append("## " + model_id) out.append("**" + str(round(params_b, 1)) + "B parameters** | " + dtype + " | " + str(layers) + " layers") out.append("") # Architecture details out.append("### Model Architecture") out.append("| Property | Value |") out.append("|----------|-------|") out.append("| Hidden Size | " + str(hidden_size) + " |") out.append("| Attention Heads | " + str(num_heads) + " |") out.append("| KV Heads (GQA) | " + str(kv_heads) + " |") out.append("| Layers | " + str(layers) + " |") out.append("| Vocab Size | " + str(vocab_size) + " |") out.append("| Max Context | " + str(max_position) + " |") if kv_heads != num_heads: out.append("| GQA Ratio | " + str(num_heads) + ":" + str(kv_heads) + " |") out.append("") if mode == "Training (Full)": grad_gb = weights_gb opt_gb = bytes_to_gb(params * 8) act_gb = weights_gb * 2 * batch total = weights_gb + grad_gb + opt_gb + act_gb out.append("### Training Memory Breakdown") out.append("| Component | Size |") out.append("|-----------|------|") out.append("| Weights | " + str(round(weights_gb, 1)) + " GB |") out.append("| Gradients | " + str(round(grad_gb, 1)) + " GB |") out.append("| Optimizer (AdamW) | " + str(round(opt_gb, 1)) + " GB |") out.append("| Activations | " + str(round(act_gb, 1)) + " GB |") elif mode == "LoRA": base = weights_gb lora_params = int(params * lora_rank * 0.0001) lora_gb = bytes_to_gb(lora_params * dtype_bytes) act_gb = base * 0.3 total = base + lora_gb + act_gb out.append("### LoRA Memory Breakdown") out.append("| Component | Size |") out.append("|-----------|------|") out.append("| Base model (frozen) | " + str(round(base, 1)) + " GB |") out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |") out.append("| Activations | " + str(round(act_gb, 1)) + " GB |") elif mode == "QLoRA": base = bytes_to_gb(params * 0.5) lora_params = int(params * lora_rank * 0.0001) lora_gb = bytes_to_gb(lora_params * dtype_bytes) act_gb = base * 0.3 total = base + lora_gb + act_gb out.append("### QLoRA Memory Breakdown") out.append("| Component | Size |") out.append("|-----------|------|") out.append("| Base model (4-bit) | " + str(round(base, 1)) + " GB |") out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |") out.append("| Activations | " + str(round(act_gb, 1)) + " GB |") else: overhead = FRAMEWORKS.get(framework, 1.15) extra = (weights_gb + kv_gb) * (overhead - 1) total = weights_gb + kv_gb + extra out.append("### Inference Memory Breakdown") out.append("| Component | Size |") out.append("|-----------|------|") out.append("| Model weights | " + str(round(weights_gb, 1)) + " GB |") out.append("| KV Cache (" + str(context) + " ctx) | " + str(round(kv_gb, 1)) + " GB |") out.append("| Framework overhead (" + framework + ") | " + str(round(extra, 1)) + " GB |") if num_gpus > 1: per_gpu = total / num_gpus * 1.05 out.append("") out.append("**Multi-GPU (" + str(num_gpus) + "x):** " + str(round(per_gpu, 1)) + " GB per GPU (includes 5% communication overhead)") effective = per_gpu else: effective = total out.append("") out.append("## Total Required: " + str(round(total, 1)) + " GB") # Consumer GPUs section with colors and throughput out.append("") out.append("### Consumer GPUs") out.append("| GPU | VRAM | Status | Headroom | Est. tok/s |") out.append("|-----|------|--------|----------|------------|") for gpu, (vram, bandwidth) in CONSUMER_GPUS.items(): hr = vram - effective if hr >= 2: status = "🟢 Good fit" elif hr >= 0: status = "🟡 Tight" else: status = "🔴 No" sign = "+" if hr >= 0 else "" if hr >= 0 and mode == "Inference": tps = estimate_throughput(params, bandwidth, batch, dtype_bytes) tps_str = str(tps) else: tps_str = "-" out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB | " + tps_str + " |") # Apple Silicon section out.append("") out.append("### Apple Silicon (Unified Memory)") out.append("| Chip | Memory | Status | Headroom | Est. tok/s |") out.append("|------|--------|--------|----------|------------|") for gpu, (vram, bandwidth) in APPLE_GPUS.items(): hr = vram - effective if hr >= 10: status = "🟢 Excellent" elif hr >= 0: status = "🟡 Usable" else: status = "🔴 No" sign = "+" if hr >= 0 else "" if hr >= 0 and mode == "Inference": tps = estimate_throughput(params, bandwidth, batch, dtype_bytes) tps_str = str(tps) else: tps_str = "-" out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB | " + tps_str + " |") # Cloud GPUs section with costs and throughput out.append("") out.append("### Cloud GPU Options") out.append("| GPU | VRAM | Status | $/hour | $/day | Est. tok/s |") out.append("|-----|------|--------|--------|-------|------------|") cloud_options = [] for gpu, (vram, cost, bandwidth) in CLOUD_GPUS.items(): hr = vram - effective if hr >= 2: status = "🟢 Good" elif hr >= 0: status = "🟡 Tight" else: status = "🔴 No" daily = cost * 8 if hr >= 0 and mode == "Inference": tps = estimate_throughput(params, bandwidth, batch, dtype_bytes) else: tps = 0 cloud_options.append((gpu, vram, hr, status, cost, daily, bandwidth, tps)) # Sort by cost for those that fit cloud_options.sort(key=lambda x: (x[2] < 0, x[4])) for gpu, vram, hr, status, cost, daily, bandwidth, tps in cloud_options: sign = "+" if hr >= 0 else "" tps_str = str(tps) if tps > 0 else "-" out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | $" + str(round(cost, 2)) + " | $" + str(round(daily, 2)) + " | " + tps_str + " |") # Best value recommendation fitting_gpus = [(gpu, cost, tps) for gpu, vram, hr, status, cost, daily, bw, tps in cloud_options if hr >= 0] if fitting_gpus: fitting_gpus.sort(key=lambda x: x[1]) best = fitting_gpus[0] out.append("") rec = "**Recommended:** " + best[0] + " at $" + str(round(best[1], 2)) + "/hour" if best[2] > 0: rec += " (~" + str(best[2]) + " tok/s)" out.append(rec) # Best performance option if len(fitting_gpus) > 1: fitting_gpus.sort(key=lambda x: -x[2]) fastest = fitting_gpus[0] if fastest[0] != best[0] and fastest[2] > 0: out.append("**Fastest:** " + fastest[0] + " (~" + str(fastest[2]) + " tok/s)") # Quantization suggestions if model is large if effective > 24: out.append("") out.append("### Quantization Options (to fit consumer GPUs)") out.append("| Method | Est. Size | Fits 24GB | Quality |") out.append("|--------|-----------|-----------|---------|") quant_options = [ ("INT8", 1.0, "Excellent"), ("4-bit (GPTQ/AWQ)", 0.5, "Very Good"), ("3-bit", 0.375, "Good"), ("2-bit (extreme)", 0.25, "Degraded"), ] for name, mult, quality in quant_options: size = bytes_to_gb(params * mult) * 1.1 fits = "🟢 Yes" if size <= 24 else "🔴 No" out.append("| " + name + " | " + str(round(size, 1)) + "GB | " + fits + " | " + quality + " |") # Context scaling info out.append("") out.append("### Context Length Scaling") out.append("| Context | KV Cache | Total Est. |") out.append("|---------|----------|------------|") for ctx_opt in [2048, 4096, 8192, 16384, 32768]: if ctx_opt <= max_position: kv_opt = bytes_to_gb(2 * layers * batch * ctx_opt * kv_heads * head_dim * dtype_bytes) total_opt = weights_gb + kv_opt out.append("| " + str(ctx_opt) + " | " + str(round(kv_opt, 1)) + "GB | " + str(round(total_opt, 1)) + "GB |") return "\n".join(out) except Exception as e: return "Error: " + str(e) def compare(models_text, context): """Compare multiple models""" try: if not models_text: return "Enter model IDs, one per line" models = [m.strip() for m in models_text.strip().split("\n") if m.strip()] if len(models) < 2: return "Need at least 2 models" out = [] out.append("## Model Comparison") out.append("") out.append("| Model | Params | Inference | Training | QLoRA | Fits 24GB |") out.append("|-------|--------|-----------|----------|-------|-----------|") for mid in models[:8]: try: info = fetch_model_info(mid) config = fetch_config(mid) params, dtype = get_params(info) if params == 0: out.append("| " + mid + " | Error | - | - | - | - |") continue db = DTYPE_BYTES.get(dtype, 2) w = bytes_to_gb(params * db) layers = config.get("num_hidden_layers", 32) kv_heads = config.get("num_key_value_heads", 32) kv = bytes_to_gb(2 * layers * context * kv_heads * 128 * db) inf = w + kv train = w * 4 + w * 2 qlora = bytes_to_gb(params * 0.5) * 1.5 fits = "🟢 Yes" if inf <= 24 else "🔴 No" name = mid.split("/")[-1][:25] out.append("| " + name + " | " + str(round(params / 1e9, 1)) + "B | " + str(round(inf, 1)) + "GB | " + str(round(train, 1)) + "GB | " + str(round(qlora, 1)) + "GB | " + fits + " |") except Exception: out.append("| " + mid + " | Error | - | - | - | - |") out.append("") out.append("*Context length: " + str(context) + " tokens*") return "\n".join(out) except Exception as e: return "Error: " + str(e) def generate_report(model_id, context, batch, mode, framework, num_gpus, lora_rank): """Generate a shareable text report""" try: if not model_id or not model_id.strip(): return "Enter a model ID first" result = calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank) report = [] report.append("=" * 50) report.append("VRAM CALCULATOR REPORT") report.append("=" * 50) report.append("") report.append("Settings:") report.append(" Model: " + model_id) report.append(" Mode: " + mode) report.append(" Context: " + str(context)) report.append(" Batch Size: " + str(batch)) report.append(" Framework: " + framework) report.append(" GPUs: " + str(num_gpus)) if "LoRA" in mode: report.append(" LoRA Rank: " + str(lora_rank)) report.append("") report.append("-" * 50) report.append("") report.append(result) report.append("") report.append("-" * 50) report.append("Generated by VRAM Calculator") report.append("https://huggingface.co/spaces/Livengood/Instance-VRAM-Calculator") return "\n".join(report) except Exception as e: return "Error generating report: " + str(e) # Build the interface with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo: gr.Markdown("# VRAM Calculator for LLMs") gr.Markdown("Estimate VRAM requirements and throughput for HuggingFace models") with gr.Tabs(): with gr.TabItem("Calculator"): model_in = gr.Textbox( label="Model ID", placeholder="meta-llama/Llama-3.1-8B", info="Enter a HuggingFace model ID (e.g., organization/model-name)" ) mode_in = gr.Radio( choices=["Inference", "Training (Full)", "LoRA", "QLoRA"], value="Inference", label="Mode" ) with gr.Row(): ctx_in = gr.Slider( minimum=512, maximum=131072, value=4096, step=512, label="Context Length", info="Max tokens for KV cache" ) batch_in = gr.Slider( minimum=1, maximum=64, value=1, step=1, label="Batch Size", info="Concurrent sequences" ) with gr.Accordion("Advanced Options", open=False): framework_in = gr.Dropdown( choices=list(FRAMEWORKS.keys()), value="vLLM", label="Inference Framework" ) gpus_in = gr.Slider( minimum=1, maximum=8, value=1, step=1, label="Number of GPUs", info="For tensor parallelism" ) lora_in = gr.Slider( minimum=4, maximum=128, value=16, step=4, label="LoRA Rank", info="Higher = more parameters" ) with gr.Row(): calc_btn = gr.Button("Calculate VRAM", variant="primary") export_btn = gr.Button("Export Report", variant="secondary") output = gr.Markdown() export_output = gr.Textbox(label="Exportable Report", lines=10, visible=False) calc_btn.click( fn=calculate, inputs=[model_in, ctx_in, batch_in, mode_in, framework_in, gpus_in, lora_in], outputs=output ) def show_export(model_id, context, batch, mode, framework, num_gpus, lora_rank): report = generate_report(model_id, context, batch, mode, framework, num_gpus, lora_rank) return gr.update(visible=True, value=report) export_btn.click( fn=show_export, inputs=[model_in, ctx_in, batch_in, mode_in, framework_in, gpus_in, lora_in], outputs=export_output ) gr.Markdown("### Popular Models") gr.Examples( examples=[ ["meta-llama/Llama-3.1-8B"], ["meta-llama/Llama-3.1-70B"], ["meta-llama/Llama-3.2-1B"], ["meta-llama/Llama-3.2-3B"], ["mistralai/Mistral-7B-v0.1"], ["mistralai/Mixtral-8x7B-v0.1"], ["Qwen/Qwen2.5-7B"], ["Qwen/Qwen2.5-72B"], ["google/gemma-2-9b"], ["google/gemma-2-27b"], ["microsoft/phi-3-mini-4k-instruct"], ["deepseek-ai/DeepSeek-V2-Lite"], ["NousResearch/Hermes-3-Llama-3.1-8B"], ["01-ai/Yi-1.5-34B"], ], inputs=[model_in], label="Click to load" ) with gr.TabItem("Compare Models"): gr.Markdown("Compare VRAM requirements across multiple models") cmp_in = gr.Textbox( label="Models (one per line)", lines=6, placeholder="meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B" ) cmp_ctx = gr.Slider( minimum=512, maximum=131072, value=4096, step=512, label="Context Length" ) cmp_btn = gr.Button("Compare Models", variant="primary") cmp_out = gr.Markdown() cmp_btn.click( fn=compare, inputs=[cmp_in, cmp_ctx], outputs=cmp_out ) gr.Markdown("### Quick Comparison Sets") gr.Examples( examples=[ ["meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmeta-llama/Llama-3.2-3B"], ["mistralai/Mistral-7B-v0.1\nmistralai/Mixtral-8x7B-v0.1"], ["Qwen/Qwen2.5-7B\nQwen/Qwen2.5-14B\nQwen/Qwen2.5-72B"], ["google/gemma-2-2b\ngoogle/gemma-2-9b\ngoogle/gemma-2-27b"], ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B\ngoogle/gemma-2-9b"], ], inputs=[cmp_in], label="Click to load comparison" ) with gr.TabItem("GPU Reference"): gr.Markdown("## GPU VRAM & Bandwidth Reference") gr.Markdown("Memory bandwidth significantly affects inference speed (tokens/second)") gr.Markdown("### Consumer GPUs (NVIDIA GeForce)") consumer_md = "| GPU | VRAM | Bandwidth | Best For |\n|-----|------|-----------|----------|\n" for gpu, (vram, bw) in CONSUMER_GPUS.items(): if vram <= 12: use = "Small models (3-7B)" elif vram <= 16: use = "7B models" else: use = "7B-13B models, fine-tuning" consumer_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | " + use + " |\n" gr.Markdown(consumer_md) gr.Markdown("### Apple Silicon") apple_md = "| Chip | Unified Memory | Bandwidth | Notes |\n|------|----------------|-----------|-------|\n" for gpu, (vram, bw) in APPLE_GPUS.items(): apple_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | Shared CPU/GPU |\n" gr.Markdown(apple_md) gr.Markdown("### Cloud/Datacenter GPUs") cloud_md = "| GPU | VRAM | Bandwidth | $/hr | Best For |\n|-----|------|-----------|------|----------|\n" for gpu, (vram, cost, bw) in CLOUD_GPUS.items(): if vram <= 24: use = "7B models, fine-tuning" elif vram <= 48: use = "13B-30B models" else: use = "70B+ models, training" cloud_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | $" + str(round(cost, 2)) + " | " + use + " |\n" gr.Markdown(cloud_md) gr.Markdown("### Understanding Throughput") gr.Markdown(""" **Tokens per second (tok/s)** estimates are based on memory bandwidth and model size. - **Memory-bound inference**: Most LLM inference is limited by how fast weights can be loaded from memory - **Bandwidth formula**: `tok/s ≈ (bandwidth / model_size) × batch_size × efficiency` - **Batching**: Higher batch sizes improve throughput but use more VRAM for KV cache - **Quantization**: 4-bit models load 4x faster but may have quality tradeoffs *Estimates are approximate. Actual performance depends on implementation, optimizations, and workload.* """) with gr.TabItem("Tips & Guide"): gr.Markdown(""" ## Quick Guide ### Choosing the Right Mode | Mode | Use Case | VRAM Multiplier | |------|----------|-----------------| | **Inference** | Running predictions | 1x weights + KV cache | | **Training (Full)** | Training from scratch | 4-6x weights | | **LoRA** | Fine-tuning with adapters | 1.3x weights | | **QLoRA** | Memory-efficient fine-tuning | 0.5x weights + adapters | ### VRAM Rule of Thumb - **Inference**: `params × 2 bytes` (FP16) + KV cache - **Training**: `params × 18-20 bytes` (weights + gradients + optimizer + activations) - **QLoRA**: `params × 0.5-0.6 bytes` (4-bit) + small adapter overhead ### Fitting Large Models 1. **Use quantization** (INT8, 4-bit) to reduce memory 2-4x 2. **Reduce context length** to shrink KV cache 3. **Use multi-GPU** for tensor parallelism 4. **Try QLoRA** instead of full fine-tuning ### Recommended Setups | Model Size | Inference | QLoRA Training | |------------|-----------|----------------| | 7B | RTX 3090/4090 (24GB) | RTX 3090/4090 | | 13B | A10G or 2x RTX 3090 | RTX 4090 (4-bit) | | 30B | A100 40GB or 2x RTX 4090 | A10G (4-bit) | | 70B | A100 80GB or 4x RTX 4090 | A100 40GB (4-bit) | ### Cost Optimization Tips 1. **Start small**: Test with smaller models first 2. **Use spot instances**: 60-90% cheaper for training 3. **Right-size**: Don't overpay for unused VRAM 4. **Consider Apple Silicon**: M2/M3/M4 Max good for local inference """) gr.Markdown("---") gr.Markdown("*Estimates are approximate. Actual usage varies by implementation, batch size, and optimizations.*") if __name__ == "__main__": demo.launch()