File size: 7,460 Bytes

dff8d35

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db17f2cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/rameyjm7/workspace/TML/lpu/llm-preference-unlearning/lpu-env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "`torch_dtype` is deprecated! Use `dtype` instead!\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Loaded Qwen/Qwen2.5-3B-Instruct on cuda with 36 transformer layers.\n",
      "[INFO] Saved activations for prompt 1: 36 layers × 2 versions (full & pooled)\n",
      "[INFO] Saved activations for prompt 2: 36 layers × 2 versions (full & pooled)\n",
      "[INFO] Saved activations for prompt 3: 36 layers × 2 versions (full & pooled)\n",
      "[INFO] Saved activations for prompt 4: 36 layers × 2 versions (full & pooled)\n",
      "[INFO] Saved activations for prompt 5: 36 layers × 2 versions (full & pooled)\n",
      "[INFO] Activation extraction complete → activations/\n"
     ]
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "#!/usr/bin/env python3\n",
    "\"\"\"\n",
    "activation_probe_detailed.py — Phase 3.1–3.2 (Final)\n",
    "Captures both full token-wise and mean-pooled activations\n",
    "from all transformer layers of Qwen2.5-3B-Instruct.\n",
    "\n",
    "Output structure:\n",
    "activations/\n",
    " ├─ prompt01/\n",
    " │   ├─ layer00_full.npy\n",
    " │   ├─ layer00_pooled.npy\n",
    " │   ├─ ...\n",
    " │   └─ layer35_pooled.npy\n",
    " ├─ prompt02/\n",
    " │   └─ ...\n",
    "\"\"\"\n",
    "import os\n",
    "import json\n",
    "import torch\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------\n",
    "# 1. Model Loading\n",
    "# ---------------------------------------------------------------------\n",
    "def load_model(model_name=\"Qwen/Qwen2.5-3B-Instruct\"):\n",
    "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "    model = AutoModelForCausalLM.from_pretrained(\n",
    "        model_name,\n",
    "        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,\n",
    "        device_map=\"auto\"\n",
    "    )\n",
    "    model.eval()\n",
    "    n_layers = len(model.model.layers)\n",
    "    print(f\"[INFO] Loaded {model_name} on {device} with {n_layers} transformer layers.\")\n",
    "    return model, tokenizer, device, n_layers\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------\n",
    "# 2. Hook registration (safe)\n",
    "# ---------------------------------------------------------------------\n",
    "def register_hooks(model, store):\n",
    "    \"\"\"Attach forward hooks that safely copy activations to CPU.\"\"\"\n",
    "    handles = []\n",
    "    for idx, layer in enumerate(model.model.layers):\n",
    "        def hook_fn(module, inp, out, layer_idx=idx):\n",
    "            store[layer_idx] = out[0].detach().cpu()\n",
    "        handles.append(layer.register_forward_hook(hook_fn))\n",
    "    return handles\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------\n",
    "# 3. Activation Capture\n",
    "# ---------------------------------------------------------------------\n",
    "def capture_activations(model, tokenizer, device, prompts, save_dir=\"activations\"):\n",
    "    os.makedirs(save_dir, exist_ok=True)\n",
    "    store = {}\n",
    "    hooks = register_hooks(model, store)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for i, prompt in enumerate(prompts, start=1):\n",
    "            store.clear()\n",
    "            inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
    "            _ = model(**inputs)\n",
    "\n",
    "            prompt_dir = os.path.join(save_dir, f\"prompt{i:02d}\")\n",
    "            os.makedirs(prompt_dir, exist_ok=True)\n",
    "\n",
    "            for layer_idx, tensor in store.items():\n",
    "                # Save full token activations: (seq_len, hidden_dim)\n",
    "                full = tensor.squeeze(0).cpu().numpy()\n",
    "                np.save(f\"{prompt_dir}/layer{layer_idx:02d}_full.npy\", full)\n",
    "\n",
    "                # Save mean-pooled activations: (hidden_dim,)\n",
    "                pooled = full.mean(axis=0)\n",
    "                np.save(f\"{prompt_dir}/layer{layer_idx:02d}_pooled.npy\", pooled)\n",
    "\n",
    "            print(f\"[INFO] Saved activations for prompt {i}: \"\n",
    "                  f\"{len(store)} layers × 2 versions (full & pooled)\")\n",
    "\n",
    "    # Remove hooks after all prompts processed\n",
    "    for h in hooks:\n",
    "        h.remove()\n",
    "\n",
    "    print(f\"[INFO] Activation extraction complete → {save_dir}/\")\n",
    "\n",
    "\n",
    "# ---------------------------------------------------------------------\n",
    "# 4. Main Entry\n",
    "# ---------------------------------------------------------------------\n",
    "def main():\n",
    "    # Load latest recommender JSON log\n",
    "    log_dir = \"logs\"\n",
    "    log_files = sorted([\n",
    "        f for f in os.listdir(log_dir)\n",
    "        if f.startswith(\"recommender_\") and f.endswith(\".json\")\n",
    "    ])\n",
    "    if not log_files:\n",
    "        raise FileNotFoundError(\"No recommender_*.json log found.\")\n",
    "    latest_log = os.path.join(log_dir, log_files[-1])\n",
    "\n",
    "    with open(latest_log, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    prompts = [r[\"question\"] for r in data[\"records\"]]\n",
    "\n",
    "    model, tokenizer, device, n_layers = load_model()\n",
    "    capture_activations(model, tokenizer, device, prompts)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lpu-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}