Spaces:

fariasultana
/

MiniMind-API

Runtime error

App Files Files Community

fariasultana committed 20 days ago

Commit

bd21ba5

verified ·

1 Parent(s): c1384b2

MiniMind Max2 API - Gradio Interface

Browse files

Files changed (8) hide show

README.md +59 -6
app.py +203 -0
model_files/configs/__init__.py +15 -0
model_files/configs/model_config.py +154 -0
model_files/model/__init__.py +52 -0
model_files/model/components.py +274 -0
model_files/model/mind2_model.py +185 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,12 +1,65 @@
 ---
-title: MiniMind API
-emoji: 😻
-colorFrom: indigo
-colorTo: pink
 sdk: gradio
-sdk_version: 6.0.2
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MiniMind Max2 API
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+license: apache-2.0
+tags:
+  - text-generation
+  - moe
+  - fastapi
+  - language-model
 ---
+# 🧠 MiniMind Max2 API
+**Tiny Model, Powerful Experience** - An efficient language model API with FastAPI backend.
+## Features
+- **Mixture of Experts (MoE)**: Only 25% of parameters activated per token
+- **Grouped Query Attention**: 4:1 ratio for memory efficiency
+- **FastAPI Backend**: RESTful API with automatic docs
+- **Gradio Interface**: Interactive UI for testing
+## API Endpoints
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/docs` | GET | Swagger UI documentation |
+| `/generate` | POST | Generate text from prompt |
+| `/model-info` | GET | Get model architecture info |
+| `/health` | GET | Health check |
+| `/gradio` | GET | Interactive Gradio interface |
+## Example Usage
+```python
+import requests
+response = requests.post(
+    "https://your-space.hf.space/generate",
+    json={
+        "prompt": "Once upon a time",
+        "max_new_tokens": 100,
+        "temperature": 0.8
+    }
+)
+print(response.json()["generated_text"])
+```
+## Model Variants
+| Model | Total Params | Active Params | Target |
+|-------|-------------|---------------|--------|
+| max2-nano | 500M | 125M | IoT, Mobile |
+| max2-lite | 1.5B | 375M | Mobile, Tablet |
+| max2-pro | 3B | 750M | Desktop |
+## License
+Apache 2.0

app.py ADDED Viewed

	@@ -0,0 +1,203 @@

+"""
+MiniMind Max2 - Gradio Space
+A lightweight, efficient language model with MoE architecture.
+"""
+import os
+import sys
+from pathlib import Path
+# Add model files to path
+sys.path.insert(0, str(Path(__file__).parent / "model_files"))
+import torch
+import gradio as gr
+# Configuration
+MODEL_NAME = os.getenv("MODEL_NAME", "max2-nano")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
+# Global model
+model = None
+config = None
+def load_model():
+    """Load the Max2 model."""
+    global model, config
+    from configs.model_config import get_config, estimate_params
+    from model import Max2ForCausalLM
+    print(f"🔄 Loading {MODEL_NAME} on {DEVICE}...")
+    config = get_config(MODEL_NAME)
+    model = Max2ForCausalLM(config)
+    model = model.to(device=DEVICE, dtype=DTYPE)
+    model.eval()
+    params = estimate_params(config)
+    print(f"✅ Model loaded: {params['total_params_b']:.3f}B total, {params['active_params_b']:.3f}B active")
+    return model, config
+def generate_text(prompt, max_tokens, temperature, top_k, top_p):
+    """Generate text from prompt."""
+    global model, config
+    if model is None:
+        load_model()
+    if not prompt.strip():
+        return "Please enter a prompt."
+    try:
+        # Simple character-level tokenization (demo purposes)
+        # In production, use SentencePiece or similar tokenizer
+        prompt_ids = [ord(c) % config.vocab_size for c in prompt]
+        input_ids = torch.tensor([prompt_ids], device=DEVICE)
+        with torch.no_grad():
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=int(max_tokens),
+                temperature=temperature,
+                top_k=int(top_k),
+                top_p=top_p,
+                do_sample=True,
+            )
+        # Decode generated tokens
+        generated_ids = output_ids[0, len(prompt_ids):].tolist()
+        generated_text = "".join([chr(min(max(i, 32), 126)) for i in generated_ids])
+        return prompt + generated_text
+    except Exception as e:
+        return f"Error: {str(e)}"
+def get_model_info():
+    """Get model information."""
+    global model, config
+    if model is None:
+        load_model()
+    from configs.model_config import estimate_params
+    params = estimate_params(config)
+    return f"""
+## Model: {config.model_name}
+| Property | Value |
+|----------|-------|
+| Total Parameters | {params['total_params_b']:.3f}B |
+| Active Parameters | {params['active_params_b']:.3f}B |
+| Activation Ratio | {params['activation_ratio']:.1%} |
+| Device | {DEVICE} |
+| Num Experts | {config.num_experts} |
+| Experts per Token | {config.num_experts_per_tok} |
+| Max Context | {config.max_position_embeddings} |
+"""
+# Create Gradio interface
+# Layout: a header, two tabs ("Generate" and "Model Info"), and a static
+# footer describing the architecture. ``demo`` is the object launched below.
+with gr.Blocks(title="MiniMind Max2", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🧠 MiniMind Max2
+    **Tiny Model, Powerful Experience** - An efficient language model with Mixture of Experts (MoE) architecture.
+    Only 25% of parameters are activated per token for efficient inference.
+    > ⚠️ **Note**: This demo uses character-level tokenization for simplicity.
+    > For production use, integrate a proper tokenizer (SentencePiece, etc.).
+    """)
+    with gr.Tabs():
+        with gr.TabItem("🚀 Generate"):
+            with gr.Row():
+                # Left column: prompt and sampling controls.
+                with gr.Column(scale=2):
+                    prompt_input = gr.Textbox(
+                        label="Prompt",
+                        placeholder="Enter your prompt here...",
+                        lines=4,
+                        value="Once upon a time"
+                    )
+                    with gr.Row():
+                        max_tokens = gr.Slider(
+                            minimum=10, maximum=256, value=100, step=10,
+                            label="Max New Tokens"
+                        )
+                        temperature = gr.Slider(
+                            minimum=0.1, maximum=2.0, value=0.8, step=0.1,
+                            label="Temperature"
+                        )
+                    with gr.Row():
+                        top_k = gr.Slider(
+                            minimum=1, maximum=100, value=50, step=1,
+                            label="Top-K"
+                        )
+                        top_p = gr.Slider(
+                            minimum=0.1, maximum=1.0, value=0.9, step=0.05,
+                            label="Top-P"
+                        )
+                    generate_btn = gr.Button("🎯 Generate", variant="primary")
+                # Right column: generated output.
+                with gr.Column(scale=2):
+                    output_text = gr.Textbox(
+                        label="Generated Text",
+                        lines=12,
+                        show_copy_button=True
+                    )
+            # The input order here must match generate_text's parameter order.
+            generate_btn.click(
+                fn=generate_text,
+                inputs=[prompt_input, max_tokens, temperature, top_k, top_p],
+                outputs=output_text
+            )
+            # Each example row is (prompt, max tokens, temperature, top-k, top-p).
+            gr.Examples(
+                examples=[
+                    ["Once upon a time", 100, 0.8, 50, 0.9],
+                    ["The quick brown fox", 50, 0.7, 40, 0.95],
+                    ["In a galaxy far away", 150, 1.0, 60, 0.85],
+                    ["def fibonacci(n):", 80, 0.6, 30, 0.9],
+                ],
+                inputs=[prompt_input, max_tokens, temperature, top_k, top_p],
+            )
+        with gr.TabItem("ℹ️ Model Info"):
+            info_btn = gr.Button("📊 Load Model Info")
+            info_output = gr.Markdown()
+            info_btn.click(fn=get_model_info, outputs=info_output)
+    gr.Markdown("""
+    ---
+    ### 🔧 Architecture
+    - **MoE**: 8 experts, top-2 routing (25% activation)
+    - **GQA**: Grouped Query Attention (4:1 ratio)
+    - **RoPE**: Rotary Position Embeddings
+    - **SwiGLU**: Improved activation function
+    ### 📦 Model Variants
+    | Model | Total | Active | Target |
+    |-------|-------|--------|--------|
+    | max2-nano | 500M | 125M | IoT/Mobile |
+    | max2-lite | 1.5B | 375M | Mobile/Tablet |
+    | max2-pro | 3B | 750M | Desktop |
+    ---
+    **[Model Repository](https://huggingface.co/fariasultana/MiniMind)** |
+    **License**: Apache 2.0
+    """)
+# Load model on startup
+# Best-effort: a failure here is only logged, because generate_text() and
+# get_model_info() call load_model() again while ``model`` is still None.
+try:
+    load_model()
+except Exception as e:
+    print(f"Model will load on first request: {e}")
+# Direct execution: bind to all interfaces on port 7860.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

model_files/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+"""MiniMind Max2 Configuration Module"""
+from .model_config import Max2Config, get_config, estimate_params, MAX2_CONFIGS
+# Backward compatibility
+# The old "Mind2" names are bound to the same objects as the "Max2" names,
+# so existing imports of either spelling keep working.
+Mind2Config = Max2Config
+MIND2_CONFIGS = MAX2_CONFIGS
+# Public API of the configs package.
+__all__ = [
+    "Max2Config",
+    "Mind2Config",
+    "get_config",
+    "estimate_params",
+    "MAX2_CONFIGS",
+    "MIND2_CONFIGS",
+]

model_files/configs/model_config.py ADDED Viewed

	@@ -0,0 +1,154 @@

+"""
+MiniMind Max2 Model Configuration
+Inspired by MiniMax M2's efficient activated parameters design
+"""
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+@dataclass
+class Max2Config:
+    """Configuration for MiniMind Max2 models."""
+    # Model identification
+    model_name: str = "max2-lite"
+    model_version: str = "1.0.0"
+    # Architecture dimensions
+    hidden_size: int = 1536
+    intermediate_size: int = 4096
+    num_hidden_layers: int = 24
+    num_attention_heads: int = 12
+    num_key_value_heads: int = 3  # GQA ratio 4:1
+    # Vocabulary and embeddings
+    vocab_size: int = 32000
+    max_position_embeddings: int = 8192
+    rope_theta: float = 10000.0
+    # MoE (Mixture of Experts) configuration
+    use_moe: bool = True
+    num_experts: int = 8
+    num_experts_per_tok: int = 2  # Only 25% activation
+    expert_hidden_size: int = 1024
+    router_aux_loss_coef: float = 0.01
+    # Normalization and activation
+    rms_norm_eps: float = 1e-6
+    hidden_act: str = "silu"
+    # Regularization
+    hidden_dropout: float = 0.0
+    attention_dropout: float = 0.0
+    # Special tokens
+    pad_token_id: int = 0
+    bos_token_id: int = 1
+    eos_token_id: int = 2
+    # Initialization
+    initializer_range: float = 0.02
+    # Memory optimization
+    use_cache: bool = True
+    use_flash_attention: bool = True
+    gradient_checkpointing: bool = False
+    def to_dict(self) -> Dict[str, Any]:
+        return {k: v for k, v in self.__dict__.items()}
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any]) -> "Max2Config":
+        return cls(**{k: v for k, v in config_dict.items() if k in cls.__dataclass_fields__})
+# Predefined model configurations
+# Keyed by variant name; fields not listed in an entry keep the Max2Config
+# defaults (e.g. vocab_size, rope_theta).
+MAX2_CONFIGS = {
+    # Smallest variant: 12 layers, 4 experts with 1 routed per token.
+    "max2-nano": Max2Config(
+        model_name="max2-nano",
+        hidden_size=768,
+        intermediate_size=2048,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_key_value_heads=3,
+        num_experts=4,
+        num_experts_per_tok=1,
+        expert_hidden_size=512,
+        max_position_embeddings=4096,
+    ),
+    # Mid-size variant: 24 layers, 8 experts with 2 routed per token.
+    "max2-lite": Max2Config(
+        model_name="max2-lite",
+        hidden_size=1536,
+        intermediate_size=4096,
+        num_hidden_layers=24,
+        num_attention_heads=12,
+        num_key_value_heads=3,
+        num_experts=8,
+        num_experts_per_tok=2,
+        expert_hidden_size=1024,
+        max_position_embeddings=8192,
+    ),
+    # Largest variant: 32 layers, 20 query heads sharing 4 KV heads.
+    "max2-pro": Max2Config(
+        model_name="max2-pro",
+        hidden_size=2560,
+        intermediate_size=6912,
+        num_hidden_layers=32,
+        num_attention_heads=20,
+        num_key_value_heads=4,
+        num_experts=8,
+        num_experts_per_tok=2,
+        expert_hidden_size=1728,
+        max_position_embeddings=16384,
+    ),
+}
+# Aliases for backward compatibility
+Mind2Config = Max2Config
+MIND2_CONFIGS = MAX2_CONFIGS
+def get_config(model_name: str) -> Max2Config:
+    """Get predefined configuration by name."""
+    if model_name not in MAX2_CONFIGS:
+        raise ValueError(f"Unknown model: {model_name}. Available: {list(MAX2_CONFIGS.keys())}")
+    return MAX2_CONFIGS[model_name]
+def estimate_params(config: Max2Config) -> dict:
+    """Estimate parameter counts for a configuration."""
+    embed_params = config.vocab_size * config.hidden_size
+    head_dim = config.hidden_size // config.num_attention_heads
+    # Attention parameters per layer (GQA)
+    q_params = config.hidden_size * config.hidden_size
+    kv_params = 2 * config.hidden_size * (config.num_key_value_heads * head_dim)
+    o_params = config.hidden_size * config.hidden_size
+    attn_params_per_layer = q_params + kv_params + o_params
+    # MoE FFN parameters per layer
+    if config.use_moe:
+        router_params = config.hidden_size * config.num_experts
+        expert_params = 3 * config.hidden_size * config.expert_hidden_size
+        ffn_params_per_layer = router_params + (config.num_experts * expert_params)
+        active_ffn_params = router_params + (config.num_experts_per_tok * expert_params)
+    else:
+        ffn_params_per_layer = 3 * config.hidden_size * config.intermediate_size
+        active_ffn_params = ffn_params_per_layer
+    norm_params_per_layer = 2 * config.hidden_size
+    layer_params = attn_params_per_layer + ffn_params_per_layer + norm_params_per_layer
+    active_layer_params = attn_params_per_layer + active_ffn_params + norm_params_per_layer
+    total_params = embed_params + (config.num_hidden_layers * layer_params) + embed_params
+    active_params = embed_params + (config.num_hidden_layers * active_layer_params) + embed_params
+    return {
+        "total_params": total_params,
+        "active_params": active_params,
+        "activation_ratio": active_params / total_params,
+        "total_params_b": total_params / 1e9,
+        "active_params_b": active_params / 1e9,
+        "estimated_size_fp16_gb": (total_params * 2) / (1024**3),
+        "estimated_size_int4_gb": (total_params * 0.5) / (1024**3),
+    }

model_files/model/__init__.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""
+MiniMind Max2 Model Package
+A lightweight, efficient language model designed for edge deployment.
+"""
+from .mind2_model import (
+    Max2ForCausalLM,
+    Max2Model,
+    Mind2ForCausalLM,
+    Mind2Model,
+    create_model
+)
+from .components import (
+    Max2Attention,
+    Max2MoE,
+    Max2DecoderLayer,
+    Max2RMSNorm,
+    Max2RotaryEmbedding,
+    Max2MLP,
+    Max2Expert,
+    # Backward compatibility
+    Mind2Attention,
+    Mind2MoE,
+    Mind2DecoderLayer,
+    Mind2RMSNorm,
+    Mind2RotaryEmbedding,
+)
+# Public API of the model package: the Max2 classes, their Mind2 aliases and
+# the factory function.
+__all__ = [
+    # Max2 (primary)
+    "Max2ForCausalLM",
+    "Max2Model",
+    "Max2Attention",
+    "Max2MoE",
+    "Max2DecoderLayer",
+    "Max2RMSNorm",
+    "Max2RotaryEmbedding",
+    "Max2MLP",
+    "Max2Expert",
+    # Mind2 (backward compatibility)
+    "Mind2ForCausalLM",
+    "Mind2Model",
+    "Mind2Attention",
+    "Mind2MoE",
+    "Mind2DecoderLayer",
+    "Mind2RMSNorm",
+    "Mind2RotaryEmbedding",
+    # Factory
+    "create_model",
+]
+__version__ = "1.0.0"

model_files/model/components.py ADDED Viewed

	@@ -0,0 +1,274 @@

+"""
+MiniMind Max2 Model Components
+Core building blocks: RMSNorm, RoPE, GQA Attention, MoE
+"""
+import math
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from configs.model_config import Max2Config
+class Max2RMSNorm(nn.Module):
+    """Root Mean Square Layer Normalization (faster than LayerNorm)."""
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        input_dtype = x.dtype
+        x = x.to(torch.float32)
+        variance = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.eps)
+        return self.weight * x.to(input_dtype)
+class Max2RotaryEmbedding(nn.Module):
+    """Rotary Position Embedding (RoPE) for efficient position encoding."""
+    def __init__(self, dim: int, max_position_embeddings: int = 8192, base: float = 10000.0):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._set_cos_sin_cache(max_position_embeddings)
+    def _set_cos_sin_cache(self, seq_len: int):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(seq_len, dtype=torch.float32)
+        freqs = torch.outer(t, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+    def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len)
+        return self.cos_cached[:seq_len].to(x.dtype), self.sin_cached[:seq_len].to(x.dtype)
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Rotate half the hidden dims of the input."""
+    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply rotary position embeddings to query and key tensors."""
+    cos = cos.unsqueeze(0).unsqueeze(0)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class Max2Attention(nn.Module):
+    """Grouped Query Attention (GQA) - fewer KV heads than Q heads for memory efficiency."""
+    def __init__(self, config: Max2Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_groups = self.num_heads // self.num_kv_heads
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = Max2RotaryEmbedding(self.head_dim, config.max_position_embeddings, config.rope_theta)
+        self.attention_dropout = config.attention_dropout
+    def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+        if n_rep == 1:
+            return hidden_states
+        bs, num_kv_heads, seq_len, head_dim = hidden_states.shape
+        hidden_states = hidden_states[:, :, None, :, :].expand(bs, num_kv_heads, n_rep, seq_len, head_dim)
+        return hidden_states.reshape(bs, num_kv_heads * n_rep, seq_len, head_dim)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        batch_size, seq_len, _ = hidden_states.shape
+        query_states = self.q_proj(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rotary_emb(value_states, seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        key_states = self._repeat_kv(key_states, self.num_key_value_groups)
+        value_states = self._repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, past_key_value
+class Max2MLP(nn.Module):
+    """SwiGLU Feed-Forward Network."""
+    def __init__(self, hidden_size: int, intermediate_size: int):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+class Max2Expert(nn.Module):
+    """Single expert in the Mixture of Experts layer."""
+    def __init__(self, hidden_size: int, expert_hidden_size: int):
+        super().__init__()
+        self.mlp = Max2MLP(hidden_size, expert_hidden_size)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.mlp(x)
+class Max2MoE(nn.Module):
+    """
+    Mixture of Experts (MoE) layer.
+    Efficient parameter activation - only top-k experts are used per token.
+    Inspired by MiniMax M2's efficient activated parameters design.
+    A linear router scores every expert per token; the top
+    ``num_experts_per_tok`` experts process the token and their outputs are
+    combined with the renormalised router weights. ``forward`` also returns
+    a load-balancing auxiliary loss.
+    """
+    def __init__(self, config: Max2Config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        self.expert_hidden_size = config.expert_hidden_size
+        # Router: one logit per expert for each token.
+        self.gate = nn.Linear(self.hidden_size, self.num_experts, bias=False)
+        self.experts = nn.ModuleList([
+            Max2Expert(self.hidden_size, self.expert_hidden_size)
+            for _ in range(self.num_experts)
+        ])
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Route tokens through their top-k experts.
+        Takes ``hidden_states`` of shape (batch, seq, hidden) and returns
+        ``(output, aux_loss)`` where ``output`` has the input's shape and
+        ``aux_loss`` is a scalar already scaled by ``router_aux_loss_coef``.
+        """
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+        # Flatten batch and sequence so routing is done per token.
+        hidden_states_flat = hidden_states.view(-1, hidden_dim)
+        router_logits = self.gate(hidden_states_flat)
+        # Router probabilities are computed in float32.
+        router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)
+        router_weights, selected_experts = torch.topk(router_probs, self.num_experts_per_tok, dim=-1)
+        router_weights = router_weights.to(hidden_states.dtype)
+        # Renormalise so each token's selected weights sum to 1.
+        router_weights = router_weights / router_weights.sum(dim=-1, keepdim=True)
+        final_hidden_states = torch.zeros_like(hidden_states_flat)
+        # After the permute: (num_experts, top_k slot, token).
+        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+        for expert_idx in range(self.num_experts):
+            expert = self.experts[expert_idx]
+            for top_k_idx in range(self.num_experts_per_tok):
+                # Tokens that picked this expert in this top-k slot.
+                token_indices = expert_mask[expert_idx, top_k_idx].nonzero(as_tuple=True)[0]
+                if token_indices.numel() > 0:
+                    expert_input = hidden_states_flat[token_indices]
+                    expert_output = expert(expert_input)
+                    weights = router_weights[token_indices, top_k_idx].unsqueeze(-1)
+                    final_hidden_states[token_indices] += weights * expert_output
+        final_hidden_states = final_hidden_states.view(batch_size, seq_len, hidden_dim)
+        # Load-balancing loss: per expert, (fraction of tokens routed to it)
+        # times (mean router probability), summed and scaled by num_experts.
+        num_tokens = router_probs.shape[0]
+        expert_mask_float = F.one_hot(selected_experts, num_classes=self.num_experts).float()
+        tokens_per_expert = expert_mask_float.sum(dim=(0, 1)) / num_tokens
+        router_prob_per_expert = router_probs.mean(dim=0)
+        aux_loss = self.num_experts * (tokens_per_expert * router_prob_per_expert).sum() * self.router_aux_loss_coef
+        return final_hidden_states, aux_loss
+class Max2DecoderLayer(nn.Module):
+    """Single transformer decoder layer with GQA attention and MoE FFN."""
+    def __init__(self, config: Max2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Max2Attention(config, layer_idx)
+        if config.use_moe:
+            self.mlp = Max2MoE(config)
+            self.use_moe = True
+        else:
+            self.mlp = Max2MLP(config.hidden_size, config.intermediate_size)
+            self.use_moe = False
+        self.input_layernorm = Max2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Max2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]], torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, present_key_value = self.self_attn(hidden_states, attention_mask, past_key_value, use_cache)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        if self.use_moe:
+            hidden_states, aux_loss = self.mlp(hidden_states)
+        else:
+            hidden_states = self.mlp(hidden_states)
+            aux_loss = torch.tensor(0.0, device=hidden_states.device)
+        hidden_states = residual + hidden_states
+        return hidden_states, present_key_value, aux_loss
+# Backward compatibility aliases
+# Each "Mind2*" name is bound to the same class object as its "Max2*" name.
+Mind2RMSNorm = Max2RMSNorm
+Mind2RotaryEmbedding = Max2RotaryEmbedding
+Mind2Attention = Max2Attention
+Mind2MLP = Max2MLP
+Mind2Expert = Max2Expert
+Mind2MoE = Max2MoE
+Mind2DecoderLayer = Max2DecoderLayer

model_files/model/mind2_model.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""
+MiniMind Max2 Main Model
+Complete implementation of the Max2 language model.
+"""
+from typing import List, Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from configs.model_config import Max2Config, get_config
+from .components import Max2DecoderLayer, Max2RMSNorm
+class Max2Model(nn.Module):
+    """Max2 Transformer Model - outputs raw hidden states."""
+    def __init__(self, config: Max2Config):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
+        self.layers = nn.ModuleList([Max2DecoderLayer(config, i) for i in range(config.num_hidden_layers)])
+        self.norm = Max2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        self._init_weights()
+    def _init_weights(self):
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+                if module.bias is not None:
+                    module.bias.data.zero_()
+            elif isinstance(module, nn.Embedding):
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+    def _make_causal_mask(self, seq_len: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+        mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
+        mask = torch.triu(mask, diagonal=1)
+        return mask.unsqueeze(0).unsqueeze(0)
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]], torch.Tensor]:
+        batch_size, seq_len = input_ids.shape
+        hidden_states = self.embed_tokens(input_ids)
+        causal_mask = self._make_causal_mask(seq_len, hidden_states.dtype, hidden_states.device)
+        if attention_mask is not None:
+            padding_mask = (1.0 - attention_mask[:, None, None, :].to(hidden_states.dtype)) * float("-inf")
+            causal_mask = causal_mask + padding_mask
+        next_cache = [] if use_cache else None
+        total_aux_loss = torch.tensor(0.0, device=hidden_states.device)
+        for idx, layer in enumerate(self.layers):
+            past_kv = past_key_values[idx] if past_key_values else None
+            hidden_states, present_kv, aux_loss = layer(hidden_states, causal_mask, past_kv, use_cache)
+            if use_cache:
+                next_cache.append(present_kv)
+            total_aux_loss = total_aux_loss + aux_loss
+        hidden_states = self.norm(hidden_states)
+        return hidden_states, next_cache, total_aux_loss
+class Max2ForCausalLM(nn.Module):
+    """Max2 Model with Language Modeling head for text generation."""
+    def __init__(self, config: Max2Config):
+        super().__init__()
+        self.config = config
+        self.model = Max2Model(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.lm_head.weight = self.model.embed_tokens.weight
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List], torch.Tensor]:
+        hidden_states, next_cache, aux_loss = self.model(input_ids, attention_mask, past_key_values, use_cache)
+        logits = self.lm_head(hidden_states).float()
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = CrossEntropyLoss()(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
+            loss = loss + aux_loss
+        return loss, logits, next_cache, aux_loss
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        max_new_tokens: int = 100,
+        temperature: float = 1.0,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        do_sample: bool = True,
+    ) -> torch.LongTensor:
+        """Simple generation with top-k/top-p sampling."""
+        generated = input_ids
+        past_key_values = None
+        for _ in range(max_new_tokens):
+            if past_key_values is None:
+                _, logits, past_key_values, _ = self(generated, use_cache=True)
+            else:
+                _, logits, past_key_values, _ = self(generated[:, -1:], past_key_values=past_key_values, use_cache=True)
+            next_token_logits = logits[:, -1, :] / temperature
+            if do_sample:
+                if top_k > 0:
+                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
+                    next_token_logits[indices_to_remove] = float('-inf')
+                if top_p < 1.0:
+                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                    sorted_indices_to_remove = cumulative_probs > top_p
+                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                    sorted_indices_to_remove[..., 0] = 0
+                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                    next_token_logits[indices_to_remove] = float('-inf')
+                probs = F.softmax(next_token_logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+            generated = torch.cat([generated, next_token], dim=1)
+            if (next_token == self.config.eos_token_id).all():
+                break
+        return generated
+# Backward compatibility aliases
+# The old "Mind2" names refer to the same classes as the "Max2" names.
+Mind2Model = Max2Model
+Mind2ForCausalLM = Max2ForCausalLM
+def create_model(model_name: str = "max2-lite", device: str = "cuda", dtype: torch.dtype = torch.float16) -> Max2ForCausalLM:
+    """Factory function to create a Max2 model."""
+    config = get_config(model_name)
+    model = Max2ForCausalLM(config)
+    return model.to(device=device, dtype=dtype) if torch.cuda.is_available() else model
+# Smoke test: build each predefined variant, report its real parameter
+# count and run one forward pass with labels on random token ids.
+if __name__ == "__main__":
+    for model_name in ["max2-nano", "max2-lite", "max2-pro"]:
+        print(f"\n{'='*50}\nTesting {model_name}\n{'='*50}")
+        config = get_config(model_name)
+        model = Max2ForCausalLM(config)
+        total_params = sum(p.numel() for p in model.parameters())
+        print(f"Total Parameters: {total_params / 1e9:.3f}B")
+        # Random batch of 2 sequences of 128 tokens; the ids double as labels.
+        input_ids = torch.randint(0, config.vocab_size, (2, 128))
+        model.eval()
+        with torch.no_grad():
+            loss, logits, _, aux_loss = model(input_ids, labels=input_ids)
+        print(f"Logits shape: {logits.shape}")
+        print(f"Loss: {loss:.4f}, Aux loss: {aux_loss:.6f}")
+        print("Forward pass successful!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch>=2.0.0
2	+ gradio>=4.0.0