"""VRAM Calculator for HuggingFace Models"""
from __future__ import annotations

import gradio as gr
from huggingface_hub import HfApi, hf_hub_download
import json
from functools import lru_cache

api = HfApi()

# Consumer GPUs: (VRAM GB, Memory Bandwidth GB/s)
CONSUMER_GPUS = {
    "RTX 3080": (10, 760),
    "RTX 3080 Ti": (12, 912),
    "RTX 3090": (24, 936),
    "RTX 3090 Ti": (24, 1008),
    "RTX 4080": (16, 717),
    "RTX 4080 Super": (16, 736),
    "RTX 4090": (24, 1008),
    "RTX 5090": (32, 1792),
}

# Apple Silicon: (Unified Memory GB, Memory Bandwidth GB/s)
APPLE_GPUS = {
    "M1 Max": (64, 400),
    "M2 Max": (96, 400),
    "M2 Ultra": (192, 800),
    "M3 Max": (128, 400),
    "M4 Max": (128, 546),
}

# Cloud/Datacenter GPUs: (VRAM GB, $/hr, Memory Bandwidth GB/s)
CLOUD_GPUS = {
    "T4": (16, 0.35, 320),
    "L4": (24, 0.70, 300),
    "A10G": (24, 1.00, 600),
    "RTX A5000": (24, 0.80, 768),
    "RTX A6000": (48, 1.50, 768),
    "L40S": (48, 1.20, 864),
    "A100 40GB": (40, 3.00, 1555),
    "A100 80GB": (80, 5.00, 2039),
    "H100 80GB": (80, 8.00, 3350),
    "H100 NVL": (94, 10.00, 3938),
}

DTYPE_BYTES = {
    "F32": 4, "float32": 4,
    "F16": 2, "float16": 2,
    "BF16": 2, "bfloat16": 2,
    "I8": 1, "int8": 1,
    "U8": 1, "uint8": 1,
}

FRAMEWORKS = {
    "None (PyTorch)": 1.20,
    "vLLM": 1.10,
    "TGI": 1.15,
    "llama.cpp": 1.05,
    "Ollama": 1.08,
}


def bytes_to_gb(b):
    return b / (1024 ** 3)


@lru_cache(maxsize=50)
def fetch_model_info(model_id):
    try:
        return api.model_info(model_id, files_metadata=True)
    except Exception:
        return None


@lru_cache(maxsize=50)
def fetch_config(model_id):
    try:
        path = hf_hub_download(model_id, "config.json")
        with open(path) as f:
            return json.load(f)
    except Exception:
        return {}


def get_params(info):
    if info and hasattr(info, 'safetensors') and info.safetensors:
        params = info.safetensors.total
        dtypes = info.safetensors.parameters
        if dtypes:
            dtype = max(dtypes, key=dtypes.get)
            return params, dtype
    return 0, "F16"


def estimate_throughput(params, bandwidth_gbs, batch_size, dtype_bytes):
    """Estimate tokens/second based on memory bandwidth (rough approximation)"""
    # Simplified: tok/s ~ bandwidth / (params * dtype_bytes / batch_size)
    # This is a rough estimate; actual throughput depends on many factors
    model_gb = (params * dtype_bytes) / (1024**3)
    if model_gb == 0:
        return 0
    # Rough heuristic: memory-bound inference
    tokens_per_sec = (bandwidth_gbs / model_gb) * batch_size * 0.5  # 50% efficiency factor
    return max(1, int(tokens_per_sec))


def calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank):
    """Main calculation function"""
    try:
        if not model_id or not model_id.strip():
            return "Enter a model ID (e.g., meta-llama/Llama-3.1-8B)"

        model_id = model_id.strip()
        if "/" not in model_id:
            return "Model ID format: organization/model-name"

        info = fetch_model_info(model_id)
        if not info:
            return "Could not fetch model: " + model_id

        config = fetch_config(model_id)
        params, dtype = get_params(info)

        if params == 0:
            return "Could not read parameters (model may use .bin format)"

        dtype_bytes = DTYPE_BYTES.get(dtype, 2)
        params_b = params / 1e9
        weights_gb = bytes_to_gb(params * dtype_bytes)

        layers = config.get("num_hidden_layers", config.get("n_layer", 32))
        kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 32))
        head_dim = config.get("head_dim", 128)
        hidden_size = config.get("hidden_size", 4096)
        num_heads = config.get("num_attention_heads", 32)
        vocab_size = config.get("vocab_size", 32000)
        intermediate_size = config.get("intermediate_size", hidden_size * 4)
        max_position = config.get("max_position_embeddings", 4096)

        if not head_dim:
            head_dim = hidden_size // num_heads if num_heads else 128

        kv_bytes = 2 * layers * batch * context * kv_heads * head_dim * dtype_bytes
        kv_gb = bytes_to_gb(kv_bytes)

        out = []
        out.append("## " + model_id)
        out.append("**" + str(round(params_b, 1)) + "B parameters** | " + dtype + " | " + str(layers) + " layers")
        out.append("")

        # Architecture details
        out.append("### Model Architecture")
        out.append("| Property | Value |")
        out.append("|----------|-------|")
        out.append("| Hidden Size | " + str(hidden_size) + " |")
        out.append("| Attention Heads | " + str(num_heads) + " |")
        out.append("| KV Heads (GQA) | " + str(kv_heads) + " |")
        out.append("| Layers | " + str(layers) + " |")
        out.append("| Vocab Size | " + str(vocab_size) + " |")
        out.append("| Max Context | " + str(max_position) + " |")
        if kv_heads != num_heads:
            out.append("| GQA Ratio | " + str(num_heads) + ":" + str(kv_heads) + " |")
        out.append("")

        if mode == "Training (Full)":
            grad_gb = weights_gb
            opt_gb = bytes_to_gb(params * 8)
            act_gb = weights_gb * 2 * batch
            total = weights_gb + grad_gb + opt_gb + act_gb
            out.append("### Training Memory Breakdown")
            out.append("| Component | Size |")
            out.append("|-----------|------|")
            out.append("| Weights | " + str(round(weights_gb, 1)) + " GB |")
            out.append("| Gradients | " + str(round(grad_gb, 1)) + " GB |")
            out.append("| Optimizer (AdamW) | " + str(round(opt_gb, 1)) + " GB |")
            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
        elif mode == "LoRA":
            base = weights_gb
            lora_params = int(params * lora_rank * 0.0001)
            lora_gb = bytes_to_gb(lora_params * dtype_bytes)
            act_gb = base * 0.3
            total = base + lora_gb + act_gb
            out.append("### LoRA Memory Breakdown")
            out.append("| Component | Size |")
            out.append("|-----------|------|")
            out.append("| Base model (frozen) | " + str(round(base, 1)) + " GB |")
            out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |")
            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
        elif mode == "QLoRA":
            base = bytes_to_gb(params * 0.5)
            lora_params = int(params * lora_rank * 0.0001)
            lora_gb = bytes_to_gb(lora_params * dtype_bytes)
            act_gb = base * 0.3
            total = base + lora_gb + act_gb
            out.append("### QLoRA Memory Breakdown")
            out.append("| Component | Size |")
            out.append("|-----------|------|")
            out.append("| Base model (4-bit) | " + str(round(base, 1)) + " GB |")
            out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |")
            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
        else:
            overhead = FRAMEWORKS.get(framework, 1.15)
            extra = (weights_gb + kv_gb) * (overhead - 1)
            total = weights_gb + kv_gb + extra
            out.append("### Inference Memory Breakdown")
            out.append("| Component | Size |")
            out.append("|-----------|------|")
            out.append("| Model weights | " + str(round(weights_gb, 1)) + " GB |")
            out.append("| KV Cache (" + str(context) + " ctx) | " + str(round(kv_gb, 1)) + " GB |")
            out.append("| Framework overhead (" + framework + ") | " + str(round(extra, 1)) + " GB |")

        if num_gpus > 1:
            per_gpu = total / num_gpus * 1.05
            out.append("")
            out.append("**Multi-GPU (" + str(num_gpus) + "x):** " + str(round(per_gpu, 1)) + " GB per GPU (includes 5% communication overhead)")
            effective = per_gpu
        else:
            effective = total

        out.append("")
        out.append("## Total Required: " + str(round(total, 1)) + " GB")

        # Consumer GPUs section with colors and throughput
        out.append("")
        out.append("### Consumer GPUs")
        out.append("| GPU | VRAM | Status | Headroom | Est. tok/s |")
        out.append("|-----|------|--------|----------|------------|")
        for gpu, (vram, bandwidth) in CONSUMER_GPUS.items():
            hr = vram - effective
            if hr >= 2:
                status = "🟢 Good fit"
            elif hr >= 0:
                status = "🟡 Tight"
            else:
                status = "🔴 No"
            sign = "+" if hr >= 0 else ""
            if hr >= 0 and mode == "Inference":
                tps = estimate_throughput(params, bandwidth, batch, dtype_bytes)
                tps_str = str(tps)
            else:
                tps_str = "-"
            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB | " + tps_str + " |")

        # Apple Silicon section
        out.append("")
        out.append("### Apple Silicon (Unified Memory)")
        out.append("| Chip | Memory | Status | Headroom | Est. tok/s |")
        out.append("|------|--------|--------|----------|------------|")
        for gpu, (vram, bandwidth) in APPLE_GPUS.items():
            hr = vram - effective
            if hr >= 10:
                status = "🟢 Excellent"
            elif hr >= 0:
                status = "🟡 Usable"
            else:
                status = "🔴 No"
            sign = "+" if hr >= 0 else ""
            if hr >= 0 and mode == "Inference":
                tps = estimate_throughput(params, bandwidth, batch, dtype_bytes)
                tps_str = str(tps)
            else:
                tps_str = "-"
            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB | " + tps_str + " |")

        # Cloud GPUs section with costs and throughput
        out.append("")
        out.append("### Cloud GPU Options")
        out.append("| GPU | VRAM | Status | $/hour | $/day | Est. tok/s |")
        out.append("|-----|------|--------|--------|-------|------------|")

        cloud_options = []
        for gpu, (vram, cost, bandwidth) in CLOUD_GPUS.items():
            hr = vram - effective
            if hr >= 2:
                status = "🟢 Good"
            elif hr >= 0:
                status = "🟡 Tight"
            else:
                status = "🔴 No"
            daily = cost * 8
            if hr >= 0 and mode == "Inference":
                tps = estimate_throughput(params, bandwidth, batch, dtype_bytes)
            else:
                tps = 0
            cloud_options.append((gpu, vram, hr, status, cost, daily, bandwidth, tps))

        # Sort by cost for those that fit
        cloud_options.sort(key=lambda x: (x[2] < 0, x[4]))

        for gpu, vram, hr, status, cost, daily, bandwidth, tps in cloud_options:
            sign = "+" if hr >= 0 else ""
            tps_str = str(tps) if tps > 0 else "-"
            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | $" + str(round(cost, 2)) + " | $" + str(round(daily, 2)) + " | " + tps_str + " |")

        # Best value recommendation
        fitting_gpus = [(gpu, cost, tps) for gpu, vram, hr, status, cost, daily, bw, tps in cloud_options if hr >= 0]
        if fitting_gpus:
            fitting_gpus.sort(key=lambda x: x[1])
            best = fitting_gpus[0]
            out.append("")
            rec = "**Recommended:** " + best[0] + " at $" + str(round(best[1], 2)) + "/hour"
            if best[2] > 0:
                rec += " (~" + str(best[2]) + " tok/s)"
            out.append(rec)

            # Best performance option
            if len(fitting_gpus) > 1:
                fitting_gpus.sort(key=lambda x: -x[2])
                fastest = fitting_gpus[0]
                if fastest[0] != best[0] and fastest[2] > 0:
                    out.append("**Fastest:** " + fastest[0] + " (~" + str(fastest[2]) + " tok/s)")

        # Quantization suggestions if model is large
        if effective > 24:
            out.append("")
            out.append("### Quantization Options (to fit consumer GPUs)")
            out.append("| Method | Est. Size | Fits 24GB | Quality |")
            out.append("|--------|-----------|-----------|---------|")
            quant_options = [
                ("INT8", 1.0, "Excellent"),
                ("4-bit (GPTQ/AWQ)", 0.5, "Very Good"),
                ("3-bit", 0.375, "Good"),
                ("2-bit (extreme)", 0.25, "Degraded"),
            ]
            for name, mult, quality in quant_options:
                size = bytes_to_gb(params * mult) * 1.1
                fits = "🟢 Yes" if size <= 24 else "🔴 No"
                out.append("| " + name + " | " + str(round(size, 1)) + "GB | " + fits + " | " + quality + " |")

        # Context scaling info
        out.append("")
        out.append("### Context Length Scaling")
        out.append("| Context | KV Cache | Total Est. |")
        out.append("|---------|----------|------------|")
        for ctx_opt in [2048, 4096, 8192, 16384, 32768]:
            if ctx_opt <= max_position:
                kv_opt = bytes_to_gb(2 * layers * batch * ctx_opt * kv_heads * head_dim * dtype_bytes)
                total_opt = weights_gb + kv_opt
                out.append("| " + str(ctx_opt) + " | " + str(round(kv_opt, 1)) + "GB | " + str(round(total_opt, 1)) + "GB |")

        return "\n".join(out)
    except Exception as e:
        return "Error: " + str(e)


def compare(models_text, context):
    """Compare multiple models"""
    try:
        if not models_text:
            return "Enter model IDs, one per line"

        models = [m.strip() for m in models_text.strip().split("\n") if m.strip()]
        if len(models) < 2:
            return "Need at least 2 models"

        out = []
        out.append("## Model Comparison")
        out.append("")
        out.append("| Model | Params | Inference | Training | QLoRA | Fits 24GB |")
        out.append("|-------|--------|-----------|----------|-------|-----------|")

        for mid in models[:8]:
            try:
                info = fetch_model_info(mid)
                config = fetch_config(mid)
                params, dtype = get_params(info)
                if params == 0:
                    out.append("| " + mid + " | Error | - | - | - | - |")
                    continue

                db = DTYPE_BYTES.get(dtype, 2)
                w = bytes_to_gb(params * db)

                layers = config.get("num_hidden_layers", 32)
                kv_heads = config.get("num_key_value_heads", 32)
                kv = bytes_to_gb(2 * layers * context * kv_heads * 128 * db)

                inf = w + kv
                train = w * 4 + w * 2
                qlora = bytes_to_gb(params * 0.5) * 1.5

                fits = "🟢 Yes" if inf <= 24 else "🔴 No"
                name = mid.split("/")[-1][:25]
                out.append("| " + name + " | " + str(round(params / 1e9, 1)) + "B | " + str(round(inf, 1)) + "GB | " + str(round(train, 1)) + "GB | " + str(round(qlora, 1)) + "GB | " + fits + " |")
            except Exception:
                out.append("| " + mid + " | Error | - | - | - | - |")

        out.append("")
        out.append("*Context length: " + str(context) + " tokens*")

        return "\n".join(out)
    except Exception as e:
        return "Error: " + str(e)


def generate_report(model_id, context, batch, mode, framework, num_gpus, lora_rank):
    """Generate a shareable text report"""
    try:
        if not model_id or not model_id.strip():
            return "Enter a model ID first"

        result = calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank)

        report = []
        report.append("=" * 50)
        report.append("VRAM CALCULATOR REPORT")
        report.append("=" * 50)
        report.append("")
        report.append("Settings:")
        report.append("  Model: " + model_id)
        report.append("  Mode: " + mode)
        report.append("  Context: " + str(context))
        report.append("  Batch Size: " + str(batch))
        report.append("  Framework: " + framework)
        report.append("  GPUs: " + str(num_gpus))
        if "LoRA" in mode:
            report.append("  LoRA Rank: " + str(lora_rank))
        report.append("")
        report.append("-" * 50)
        report.append("")
        report.append(result)
        report.append("")
        report.append("-" * 50)
        report.append("Generated by VRAM Calculator")
        report.append("https://huggingface.co/spaces/Livengood/Instance-VRAM-Calculator")

        return "\n".join(report)
    except Exception as e:
        return "Error generating report: " + str(e)


# Build the interface
with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# VRAM Calculator for LLMs")
    gr.Markdown("Estimate VRAM requirements and throughput for HuggingFace models")

    with gr.Tabs():
        with gr.TabItem("Calculator"):
            model_in = gr.Textbox(
                label="Model ID",
                placeholder="meta-llama/Llama-3.1-8B",
                info="Enter a HuggingFace model ID (e.g., organization/model-name)"
            )

            mode_in = gr.Radio(
                choices=["Inference", "Training (Full)", "LoRA", "QLoRA"],
                value="Inference",
                label="Mode"
            )

            with gr.Row():
                ctx_in = gr.Slider(
                    minimum=512,
                    maximum=131072,
                    value=4096,
                    step=512,
                    label="Context Length",
                    info="Max tokens for KV cache"
                )
                batch_in = gr.Slider(
                    minimum=1,
                    maximum=64,
                    value=1,
                    step=1,
                    label="Batch Size",
                    info="Concurrent sequences"
                )

            with gr.Accordion("Advanced Options", open=False):
                framework_in = gr.Dropdown(
                    choices=list(FRAMEWORKS.keys()),
                    value="vLLM",
                    label="Inference Framework"
                )
                gpus_in = gr.Slider(
                    minimum=1,
                    maximum=8,
                    value=1,
                    step=1,
                    label="Number of GPUs",
                    info="For tensor parallelism"
                )
                lora_in = gr.Slider(
                    minimum=4,
                    maximum=128,
                    value=16,
                    step=4,
                    label="LoRA Rank",
                    info="Higher = more parameters"
                )

            with gr.Row():
                calc_btn = gr.Button("Calculate VRAM", variant="primary")
                export_btn = gr.Button("Export Report", variant="secondary")

            output = gr.Markdown()
            export_output = gr.Textbox(label="Exportable Report", lines=10, visible=False)

            calc_btn.click(
                fn=calculate,
                inputs=[model_in, ctx_in, batch_in, mode_in, framework_in, gpus_in, lora_in],
                outputs=output
            )

            def show_export(model_id, context, batch, mode, framework, num_gpus, lora_rank):
                report = generate_report(model_id, context, batch, mode, framework, num_gpus, lora_rank)
                return gr.update(visible=True, value=report)

            export_btn.click(
                fn=show_export,
                inputs=[model_in, ctx_in, batch_in, mode_in, framework_in, gpus_in, lora_in],
                outputs=export_output
            )

            gr.Markdown("### Popular Models")
            gr.Examples(
                examples=[
                    ["meta-llama/Llama-3.1-8B"],
                    ["meta-llama/Llama-3.1-70B"],
                    ["meta-llama/Llama-3.2-1B"],
                    ["meta-llama/Llama-3.2-3B"],
                    ["mistralai/Mistral-7B-v0.1"],
                    ["mistralai/Mixtral-8x7B-v0.1"],
                    ["Qwen/Qwen2.5-7B"],
                    ["Qwen/Qwen2.5-72B"],
                    ["google/gemma-2-9b"],
                    ["google/gemma-2-27b"],
                    ["microsoft/phi-3-mini-4k-instruct"],
                    ["deepseek-ai/DeepSeek-V2-Lite"],
                    ["NousResearch/Hermes-3-Llama-3.1-8B"],
                    ["01-ai/Yi-1.5-34B"],
                ],
                inputs=[model_in],
                label="Click to load"
            )

        with gr.TabItem("Compare Models"):
            gr.Markdown("Compare VRAM requirements across multiple models")
            cmp_in = gr.Textbox(
                label="Models (one per line)",
                lines=6,
                placeholder="meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B"
            )
            cmp_ctx = gr.Slider(
                minimum=512,
                maximum=131072,
                value=4096,
                step=512,
                label="Context Length"
            )
            cmp_btn = gr.Button("Compare Models", variant="primary")
            cmp_out = gr.Markdown()

            cmp_btn.click(
                fn=compare,
                inputs=[cmp_in, cmp_ctx],
                outputs=cmp_out
            )

            gr.Markdown("### Quick Comparison Sets")
            gr.Examples(
                examples=[
                    ["meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmeta-llama/Llama-3.2-3B"],
                    ["mistralai/Mistral-7B-v0.1\nmistralai/Mixtral-8x7B-v0.1"],
                    ["Qwen/Qwen2.5-7B\nQwen/Qwen2.5-14B\nQwen/Qwen2.5-72B"],
                    ["google/gemma-2-2b\ngoogle/gemma-2-9b\ngoogle/gemma-2-27b"],
                    ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B\ngoogle/gemma-2-9b"],
                ],
                inputs=[cmp_in],
                label="Click to load comparison"
            )

        with gr.TabItem("GPU Reference"):
            gr.Markdown("## GPU VRAM & Bandwidth Reference")
            gr.Markdown("Memory bandwidth significantly affects inference speed (tokens/second)")

            gr.Markdown("### Consumer GPUs (NVIDIA GeForce)")
            consumer_md = "| GPU | VRAM | Bandwidth | Best For |\n|-----|------|-----------|----------|\n"
            for gpu, (vram, bw) in CONSUMER_GPUS.items():
                if vram <= 12:
                    use = "Small models (3-7B)"
                elif vram <= 16:
                    use = "7B models"
                else:
                    use = "7B-13B models, fine-tuning"
                consumer_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | " + use + " |\n"
            gr.Markdown(consumer_md)

            gr.Markdown("### Apple Silicon")
            apple_md = "| Chip | Unified Memory | Bandwidth | Notes |\n|------|----------------|-----------|-------|\n"
            for gpu, (vram, bw) in APPLE_GPUS.items():
                apple_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | Shared CPU/GPU |\n"
            gr.Markdown(apple_md)

            gr.Markdown("### Cloud/Datacenter GPUs")
            cloud_md = "| GPU | VRAM | Bandwidth | $/hr | Best For |\n|-----|------|-----------|------|----------|\n"
            for gpu, (vram, cost, bw) in CLOUD_GPUS.items():
                if vram <= 24:
                    use = "7B models, fine-tuning"
                elif vram <= 48:
                    use = "13B-30B models"
                else:
                    use = "70B+ models, training"
                cloud_md += "| " + gpu + " | " + str(vram) + "GB | " + str(bw) + " GB/s | $" + str(round(cost, 2)) + " | " + use + " |\n"
            gr.Markdown(cloud_md)

            gr.Markdown("### Understanding Throughput")
            gr.Markdown("""
**Tokens per second (tok/s)** estimates are based on memory bandwidth and model size.

- **Memory-bound inference**: Most LLM inference is limited by how fast weights can be loaded from memory
- **Bandwidth formula**: `tok/s ≈ (bandwidth / model_size) × batch_size × efficiency`
- **Batching**: Higher batch sizes improve throughput but use more VRAM for KV cache
- **Quantization**: 4-bit models load 4x faster but may have quality tradeoffs

*Estimates are approximate. Actual performance depends on implementation, optimizations, and workload.*
""")

        with gr.TabItem("Tips & Guide"):
            gr.Markdown("""
## Quick Guide

### Choosing the Right Mode

| Mode | Use Case | VRAM Multiplier |
|------|----------|-----------------|
| **Inference** | Running predictions | 1x weights + KV cache |
| **Training (Full)** | Training from scratch | 4-6x weights |
| **LoRA** | Fine-tuning with adapters | 1.3x weights |
| **QLoRA** | Memory-efficient fine-tuning | 0.5x weights + adapters |

### VRAM Rule of Thumb

- **Inference**: `params × 2 bytes` (FP16) + KV cache
- **Training**: `params × 18-20 bytes` (weights + gradients + optimizer + activations)
- **QLoRA**: `params × 0.5-0.6 bytes` (4-bit) + small adapter overhead

### Fitting Large Models

1. **Use quantization** (INT8, 4-bit) to reduce memory 2-4x
2. **Reduce context length** to shrink KV cache
3. **Use multi-GPU** for tensor parallelism
4. **Try QLoRA** instead of full fine-tuning

### Recommended Setups

| Model Size | Inference | QLoRA Training |
|------------|-----------|----------------|
| 7B | RTX 3090/4090 (24GB) | RTX 3090/4090 |
| 13B | A10G or 2x RTX 3090 | RTX 4090 (4-bit) |
| 30B | A100 40GB or 2x RTX 4090 | A10G (4-bit) |
| 70B | A100 80GB or 4x RTX 4090 | A100 40GB (4-bit) |

### Cost Optimization Tips

1. **Start small**: Test with smaller models first
2. **Use spot instances**: 60-90% cheaper for training
3. **Right-size**: Don't overpay for unused VRAM
4. **Consider Apple Silicon**: M2/M3/M4 Max good for local inference
""")

    gr.Markdown("---")
    gr.Markdown("*Estimates are approximate. Actual usage varies by implementation, batch size, and optimizations.*")

if __name__ == "__main__":
    demo.launch()