AustingDong committed
Commit · 7e57874
Parent(s): 217eab6
improved Janus

Browse files:
- app.py +21 -24
- demo/cam.py +0 -674
- demo/visualization.py +1 -1
- janus/models/modeling_vlm.py +3 -2
- janus/models/siglip_vit.py +3 -3
- questions/VLAT.py +3 -3
app.py CHANGED
@@ -27,7 +27,8 @@ clip_utils.init_Clip()
 model_utils, vl_gpt, tokenizer = None, None, None
 model_name = "Clip"
 language_model_max_layer = 24
-
+language_model_best_layer_min = 8
+language_model_best_layer_max = 8
 vision_model_best_layer = 24

 def clean():
@@ -215,7 +216,7 @@ def multimodal_understanding(model_type,
 # Gradio interface

 def model_slider_change(model_type):
-    global model_utils, vl_gpt, tokenizer, clip_utils, model_name, language_model_max_layer,
+    global model_utils, vl_gpt, tokenizer, clip_utils, model_name, language_model_max_layer, language_model_best_layer_min, language_model_best_layer_max, vision_model_best_layer
     model_name = model_type


@@ -226,13 +227,6 @@ def model_slider_change(model_type):
         gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
     ]

-    visual_res = [
-        gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
-        gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus"),
-        gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
-        gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
-    ]
-
     language_res = [
         gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type"),
         gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
@@ -253,7 +247,7 @@ def model_slider_change(model_type):
         return tuple(encoder_only_res + sliders)

     elif model_type.split('-')[0] == "Janus":
-
+        # best seed: 70
         clean()
         set_seed()
         model_utils = Janus_Utils()
@@ -262,13 +256,14 @@ def model_slider_change(model_type):
             layer.self_attn = ModifiedLlamaAttention(layer.self_attn)

         language_model_max_layer = 24
-
+        language_model_best_layer_min = 8
+        language_model_best_layer_max = 10

         sliders = [
-            gr.Slider(minimum=1, maximum=24, value=
-            gr.Slider(minimum=1, maximum=24, value=
+            gr.Slider(minimum=1, maximum=24, value=language_model_best_layer_min, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=24, value=language_model_best_layer_max, step=1, label="visualization layers max"),
         ]
-        return tuple(
+        return tuple(language_res + sliders)

     elif model_type.split('-')[0] == "LLaVA":

@@ -278,11 +273,12 @@ def model_slider_change(model_type):
         version = model_type.split('-')[1]
         vl_gpt, tokenizer = model_utils.init_LLaVA(version=version)
         language_model_max_layer = 32 if version == "1.5" else 28
-
+        language_model_best_layer_min = 10
+        language_model_best_layer_max = 10

         sliders = [
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_min, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_max, step=1, label="visualization layers max"),
         ]
         return tuple(language_res + sliders)

@@ -295,11 +291,12 @@ def model_slider_change(model_type):
             layer.self_attn = ModifiedGemmaAttention(layer.self_attn)
         language_model_max_layer = 18
         vision_model_best_layer = 19
-
+        language_model_best_layer_min = 11
+        language_model_best_layer_max = 15

         sliders = [
-            gr.Slider(minimum=1, maximum=
-            gr.Slider(minimum=1, maximum=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_min, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_max, step=1, label="visualization layers max"),
         ]
         return tuple(language_res + sliders)

@@ -320,15 +317,15 @@ def focus_change(focus):
     if response_type.value == "answer + visualization":
         res = (
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_min, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_max, step=1, label="visualization layers max")
         )
         return res
     else:
         res = (
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
-            gr.Slider(minimum=1, maximum=language_model_max_layer, value=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_min, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer_max, step=1, label="visualization layers max")
         )
         return res

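The app.py changes above drop the unused visual_res block and give every language model a shared pair of "visualization layers" min/max sliders, with per-model defaults stored in language_model_best_layer_min / language_model_best_layer_max inside model_slider_change(). As an illustration only (the dict and helper names below are invented for this sketch, not code from the commit), the same per-model defaults could be expressed as a lookup plus a small slider factory:

import gradio as gr

# Illustrative per-model defaults mirroring the values this commit sets inside
# model_slider_change(); LAYER_DEFAULTS and build_layer_sliders are made-up names.
LAYER_DEFAULTS = {
    "Janus":      {"max_layer": 24, "best_min": 8,  "best_max": 10},
    "LLaVA":      {"max_layer": 32, "best_min": 10, "best_max": 10},  # app.py uses 28 for LLaVA-1.6
    "ChartGemma": {"max_layer": 18, "best_min": 11, "best_max": 15},
}

def build_layer_sliders(model_type: str):
    """Return the (min, max) visualization-layer sliders for a language model."""
    cfg = LAYER_DEFAULTS[model_type.split("-")[0]]
    return (
        gr.Slider(minimum=1, maximum=cfg["max_layer"], value=cfg["best_min"],
                  step=1, label="visualization layers min"),
        gr.Slider(minimum=1, maximum=cfg["max_layer"], value=cfg["best_max"],
                  step=1, label="visualization layers max"),
    )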
demo/cam.py DELETED
@@ -1,674 +0,0 @@
-import cv2
-import numpy as np
-import types
-import torch
-import torch.nn.functional as F
-import matplotlib.pyplot as plt
-from PIL import Image
-from torch import nn
-import spaces
-from demo.modify_llama import *
-
-
-class AttentionGuidedCAM:
-    def __init__(self, model, register=True):
-        self.model = model
-        self.gradients = []
-        self.activations = []
-        self.hooks = []
-        if register:
-            self._register_hooks()
-
-    def _register_hooks(self):
-        for layer in self.target_layers:
-            self.hooks.append(layer.register_forward_hook(self._forward_hook))
-            self.hooks.append(layer.register_backward_hook(self._backward_hook))
-
-    def _forward_hook(self, module, input, output):
-        self.activations.append(output)
-
-    def _backward_hook(self, module, grad_in, grad_out):
-        self.gradients.append(grad_out[0])
-
-
-    def remove_hooks(self):
-        for hook in self.hooks:
-            hook.remove()
-
-    @spaces.GPU(duration=120)
-    def generate_cam(self, input_tensor, class_idx=None):
-        raise NotImplementedError
-
-
-
-
-class AttentionGuidedCAMClip(AttentionGuidedCAM):
-    def __init__(self, model, target_layers):
-        self.target_layers = target_layers
-        super().__init__(model)
-
-    @spaces.GPU(duration=120)
-    def generate_cam(self, input_tensor, class_idx=None, visual_pooling_method="CLS"):
-        """ Generates Grad-CAM heatmap for ViT. """
-
-        # Forward pass
-        output_full = self.model(**input_tensor)
-
-        if class_idx is None:
-            class_idx = torch.argmax(output_full.logits, dim=1).item()
-
-        if visual_pooling_method == "CLS":
-            output = output_full.image_embeds
-        elif visual_pooling_method == "avg":
-            output = self.model.visual_projection(output_full.vision_model_output.last_hidden_state).mean(dim=1)
-        else:
-            # project -> pooling
-            output, _ = self.model.visual_projection(output_full.vision_model_output.last_hidden_state).max(dim=1)
-
-            # pooling -> project
-            # output_mx, _ = output_full.vision_model_output.last_hidden_state.max(dim=1)
-            # output = self.model.visual_projection(output_mx)
-
-        output.backward(output_full.text_embeds[class_idx:class_idx+1], retain_graph=True)
-
-        # Aggregate activations and gradients from ALL layers
-        self.model.zero_grad()
-        cam_sum = None
-        for act, grad in zip(self.activations, self.gradients):
-
-            # act = torch.sigmoid(act[0])
-            act = F.relu(act[0])
-
-            grad_weights = grad.mean(dim=-1, keepdim=True)
-
-            print("act shape", act.shape)
-            print("grad_weights shape", grad_weights.shape)
-
-            # cam = (act * grad_weights).sum(dim=-1)
-            cam, _ = (act * grad_weights).max(dim=-1)
-            # cam, _ = act.max(dim=-1)
-            # cam = cam.unsqueeze(0)
-            # cam, _ = grad_weights.max(dim=-1)
-            print("cam_shape: ", cam.shape)
-
-            # Sum across all layers
-            if cam_sum is None:
-                cam_sum = cam
-            else:
-                cam_sum += cam
-
-
-        # Normalize
-        cam_sum = F.relu(cam_sum)
-
-        # thresholding
-        cam_sum = cam_sum.to(torch.float32)
-        percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
-        cam_sum[cam_sum < percentile] = 0
-
-        # Reshape
-        print("cam_sum shape: ", cam_sum.shape)
-        cam_sum = cam_sum[0, 1:]
-
-        num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
-        grid_size = int(num_patches ** 0.5)
-        print(f"Detected grid size: {grid_size}x{grid_size}")
-
-        cam_sum = cam_sum.view(grid_size, grid_size).detach()
-        cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-
-        return cam_sum, output_full, grid_size
-
-
-class AttentionGuidedCAMJanus(AttentionGuidedCAM):
-    def __init__(self, model, target_layers):
-        self.target_layers = target_layers
-        super().__init__(model)
-        self._modify_layers()
-        self._register_hooks_activations()
-
-    def _modify_layers(self):
-        for layer in self.target_layers:
-            setattr(layer, "attn_gradients", None)
-            setattr(layer, "attention_map", None)
-
-            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
-            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
-            layer.save_attn_map = types.MethodType(save_attn_map, layer)
-            layer.get_attn_map = types.MethodType(get_attn_map, layer)
-
-    def _forward_activate_hooks(self, module, input, output):
-        attn_output, attn_weights = output  # Unpack outputs
-        module.save_attn_map(attn_weights)
-        attn_weights.register_hook(module.save_attn_gradients)
-
-    def _register_hooks_activations(self):
-        for layer in self.target_layers:
-            if hasattr(layer, "q_proj"):  # is an attention layer
-                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
-
-    @spaces.GPU(duration=120)
-    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
-
-        torch.autograd.set_detect_anomaly(True)
-        for param in self.model.parameters():
-            param.requires_grad = False
-
-        for layer in self.target_layers:
-            for param in layer.parameters():
-                param.requires_grad = True
-
-        # Forward pass
-        image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
-
-        input_ids = input_tensor.input_ids
-
-        if focus == "Visual Encoder":
-            # Pooling
-            # if visual_pooling_method == "CLS":
-            #     image_embeddings_pooled = image_embeddings[:, 0, :]
-            # elif visual_pooling_method == "avg":
-            #     image_embeddings_pooled = image_embeddings[:, 1:, :].mean(dim=1)
-            # elif visual_pooling_method == "max":
-            #     image_embeddings_pooled, _ = image_embeddings[:, 1:, :].max(dim=1)
-
-            # print("image_embeddings_shape: ", image_embeddings_pooled.shape)
-
-            start_idx = 620
-            # inputs_embeddings_pooled = inputs_embeddings[:, start_idx: -4].mean(dim=1)
-            self.model.zero_grad()
-            # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
-
-            loss = outputs.logits.max(dim=-1).values[0, start_idx + class_idx]
-            loss.backward()
-
-            cam_sum = None
-            for act, grad in zip(self.activations, self.gradients):
-                # act = torch.sigmoid(act)
-                act = F.relu(act[0])
-
-                # Compute mean of gradients
-                print("grad shape:", grad.shape)
-                grad_weights = grad.mean(dim=-1, keepdim=True)
-
-                print("act shape", act.shape)
-                print("grad_weights shape", grad_weights.shape)
-
-                cam, _ = (act * grad_weights).max(dim=-1)
-                # cam, _ = grad_weights.max(dim=-1)
-                print(cam.shape)
-
-                # Sum across all layers
-                if cam_sum is None:
-                    cam_sum = cam
-                else:
-                    cam_sum += cam
-
-            # Normalize
-            cam_sum = F.relu(cam_sum)
-
-            # thresholding
-            cam_sum = cam_sum.to(torch.float32)
-            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
-            cam_sum[cam_sum < percentile] = 0
-
-            # Reshape
-            # if visual_pooling_method == "CLS":
-            cam_sum = cam_sum[0, 1:]
-            print("cam_sum shape: ", cam_sum.shape)
-            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
-            grid_size = int(num_patches ** 0.5)
-            print(f"Detected grid size: {grid_size}x{grid_size}")
-
-            cam_sum = cam_sum.view(grid_size, grid_size)
-            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-            cam_sum = cam_sum.detach().to("cpu")
-
-            return cam_sum, grid_size, start_idx
-
-
-
-
-        elif focus == "Language Model":
-            self.model.zero_grad()
-            loss = outputs.logits.max(dim=-1).values.sum()
-            loss.backward()
-
-            self.activations = [layer.get_attn_map() for layer in self.target_layers]
-            self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
-
-            cam_sum = None
-            for act, grad in zip(self.activations, self.gradients):
-                # act = torch.sigmoid(act)
-                print("act_shape:", act.shape)
-                # print("act1_shape:", act[1].shape)
-
-                act = act.mean(dim=1)
-
-                # Compute mean of gradients
-                print("grad_shape:", grad.shape)
-                grad_weights = F.relu(grad.mean(dim=1))
-
-                cam = act * grad_weights
-                print(cam.shape)
-
-                # Sum across all layers
-                if cam_sum is None:
-                    cam_sum = cam
-                else:
-                    cam_sum += cam
-
-            # Normalize
-            cam_sum = F.relu(cam_sum)
-
-            # thresholding
-            cam_sum = cam_sum.to(torch.float32)
-            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
-            cam_sum[cam_sum < percentile] = 0
-
-            # cam_sum shape: [1, seq_len, seq_len]
-            cam_sum_lst = []
-            cam_sum_raw = cam_sum
-            start = 620
-            for i in range(start, cam_sum_raw.shape[1]):
-                cam_sum = cam_sum_raw[:, i, :]  # shape: [1: seq_len]
-                cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0)  # shape: [1, 576]
-                print("cam_sum shape: ", cam_sum.shape)
-                num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
-                grid_size = int(num_patches ** 0.5)
-                print(f"Detected grid size: {grid_size}x{grid_size}")
-
-                # Fix the reshaping step dynamically
-
-                cam_sum = cam_sum.view(grid_size, grid_size)
-                cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-                cam_sum = cam_sum.detach().to("cpu")
-                cam_sum_lst.append(cam_sum)
-
-
-            return cam_sum_lst, grid_size, start
-
-
-
-class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
-    def __init__(self, model, target_layers):
-        self.target_layers = target_layers
-        super().__init__(model, register=False)
-        self._modify_layers()
-        self._register_hooks_activations()
-
-    def _modify_layers(self):
-        for layer in self.target_layers:
-            setattr(layer, "attn_gradients", None)
-            setattr(layer, "attention_map", None)
-
-            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
-            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
-            layer.save_attn_map = types.MethodType(save_attn_map, layer)
-            layer.get_attn_map = types.MethodType(get_attn_map, layer)
-
-    def _forward_activate_hooks(self, module, input, output):
-        attn_output, attn_weights = output  # Unpack outputs
-        attn_weights.requires_grad_()
-        module.save_attn_map(attn_weights)
-        attn_weights.register_hook(module.save_attn_gradients)
-
-    def _register_hooks_activations(self):
-        for layer in self.target_layers:
-            if hasattr(layer, "q_proj"):  # is an attention layer
-                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
-
-    @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
-
-        # Forward pass
-        torch.autograd.set_detect_anomaly(True)
-        for param in self.model.parameters():
-            param.requires_grad = False
-
-        for layer in self.target_layers:
-            for param in layer.parameters():
-                param.requires_grad = True
-
-        outputs_raw = self.model(**inputs)
-
-        self.model.zero_grad()
-        print("outputs_raw", outputs_raw)
-
-        loss = outputs_raw.logits.max(dim=-1).values.sum()
-        loss.backward()
-
-        # get image masks
-        image_mask = []
-        last = 0
-        for i in range(inputs["input_ids"].shape[1]):
-            decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
-            if (decoded_token == "<image>"):
-                image_mask.append(True)
-                last = i
-            else:
-                image_mask.append(False)
-
-
-        # Aggregate activations and gradients from ALL layers
-        self.activations = [layer.get_attn_map() for layer in self.target_layers]
-        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
-        cam_sum = None
-
-        for act, grad in zip(self.activations, self.gradients):
-
-            print("act shape", act.shape)
-            print("grad shape", grad.shape)
-
-            grad = F.relu(grad)
-
-            cam = act * grad  # shape: [1, heads, seq_len, seq_len]
-            cam = cam.sum(dim=1)  # shape: [1, seq_len, seq_len]
-
-            # Sum across all layers
-            if cam_sum is None:
-                cam_sum = cam
-            else:
-                cam_sum += cam
-
-        cam_sum = F.relu(cam_sum)
-        cam_sum = cam_sum.to(torch.float32)
-
-
-        # cam_sum shape: [1, seq_len, seq_len]
-        cam_sum_lst = []
-        cam_sum_raw = cam_sum
-        start_idx = last + 1
-        for i in range(start_idx, cam_sum_raw.shape[1]):
-            cam_sum = cam_sum_raw[0, i, :]  # shape: [1: seq_len]
-
-            cam_sum = cam_sum[image_mask].unsqueeze(0)  # shape: [1, img_seq_len]
-            print("cam_sum shape: ", cam_sum.shape)
-            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
-            grid_size = int(num_patches ** 0.5)
-            print(f"Detected grid size: {grid_size}x{grid_size}")
-
-            cam_sum = cam_sum.view(grid_size, grid_size)
-            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-            cam_sum_lst.append(cam_sum)
-
-
-        return cam_sum_lst, grid_size, start_idx
-
-
-
-
-
-
-class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
-    def __init__(self, model, target_layers):
-        self.target_layers = target_layers
-        super().__init__(model, register=True)
-        self._modify_layers()
-        self._register_hooks_activations()
-
-    def _modify_layers(self):
-        for layer in self.target_layers:
-            setattr(layer, "attn_gradients", None)
-            setattr(layer, "attention_map", None)
-
-            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
-            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
-            layer.save_attn_map = types.MethodType(save_attn_map, layer)
-            layer.get_attn_map = types.MethodType(get_attn_map, layer)
-
-    def _forward_activate_hooks(self, module, input, output):
-        attn_output, attn_weights = output  # Unpack outputs
-        print("attn_output shape:", attn_output.shape)
-        print("attn_weights shape:", attn_weights.shape)
-        module.save_attn_map(attn_weights)
-        attn_weights.register_hook(module.save_attn_gradients)
-
-    def _register_hooks_activations(self):
-        for layer in self.target_layers:
-            if hasattr(layer, "q_proj"):  # is an attention layer
-                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
-
-    @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
-
-        # Forward pass
-        torch.autograd.set_detect_anomaly(True)
-        for param in self.model.parameters():
-            param.requires_grad = False
-
-        for layer in self.target_layers:
-            for param in layer.parameters():
-                param.requires_grad = True
-
-        outputs_raw = self.model(**inputs, output_hidden_states=True)
-
-
-        # get image masks
-        image_mask = []
-        last = 0
-        for i in range(inputs["input_ids"].shape[1]):
-            decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
-            if (decoded_token == "<image>"):
-                image_mask.append(True)
-                last = i
-            else:
-                image_mask.append(False)
-        start_idx = last + 1
-
-
-        if focus == "Visual Encoder":
-            # image_embeddings = outputs_raw.image_hidden_states
-            # inputs_embeddings = outputs_raw.hidden_states[0]
-            # # Pooling
-            # if visual_pooling_method == "avg":
-            #     image_embeddings_pooled = image_embeddings.mean(dim=1)  # end of image: 618
-            # elif visual_pooling_method == "max":
-            #     image_embeddings_pooled, _ = image_embeddings.max(dim=1)
-
-            # print("image_embeddings_shape: ", image_embeddings_pooled.shape)
-
-
-            # inputs_embeddings_pooled = inputs_embeddings[:, start_idx:].mean(dim=1)
-            self.model.zero_grad()
-            # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
-
-            loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + class_idx]
-            loss.backward()
-
-            cam_sum = None
-            for act, grad in zip(self.activations, self.gradients):
-                # act = torch.sigmoid(act)
-                act = F.relu(act[0])
-
-                # Compute mean of gradients
-                print("grad shape:", grad.shape)
-                grad_weights = grad.mean(dim=-1, keepdim=True)
-
-                print("act shape", act.shape)
-                print("grad_weights shape", grad_weights.shape)
-
-                cam = (act * grad_weights).sum(dim=-1)
-                # cam, _ = (act * grad_weights).max(dim=-1)
-                # cam, _ = grad_weights.max(dim=-1)
-                print(cam.shape)
-
-                # Sum across all layers
-                if cam_sum is None:
-                    cam_sum = cam
-                else:
-                    cam_sum += cam
-
-            # Normalize
-            cam_sum = F.relu(cam_sum)
-
-            # thresholding
-            cam_sum = cam_sum.to(torch.float32).detach().cpu()
-            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
-            cam_sum[cam_sum < percentile] = 0
-
-            # Reshape
-            print("cam_sum shape: ", cam_sum.shape)
-            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
-            grid_size = int(num_patches ** 0.5)
-            print(f"Detected grid size: {grid_size}x{grid_size}")
-
-            cam_sum = cam_sum.view(grid_size, grid_size)
-            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-
-            return cam_sum, grid_size, start_idx
-
-        elif focus == "Language Model":
-            self.model.zero_grad()
-            print("logits shape:", outputs_raw.logits.shape)
-            # loss = outputs_raw.logits.max(dim=-1).values.sum()
-            if class_idx == -1:
-                loss = outputs_raw.logits.max(dim=-1).values.sum()
-            else:
-                loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + class_idx]
-            loss.backward()
-
-
-            # Aggregate activations and gradients from ALL layers
-            self.activations = [layer.get_attn_map() for layer in self.target_layers]
-            self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
-            print(f"layers shape: {len(self.target_layers)}")
-            print("activations & gradients shape", len(self.activations), len(self.gradients))
-
-            cams = []
-
-            # Ver 2
-            for act, grad in zip(self.activations, self.gradients):
-
-                print("act shape", act.shape)
-                print("grad shape", grad.shape)
-
-                grad = F.relu(grad)
-
-                # cam = grad
-                cam = act * grad  # shape: [1, heads, seq_len, seq_len]
-                cam = cam.sum(dim=1)  # shape: [1, seq_len, seq_len]
-                cam = cam.to(torch.float32).detach().cpu()
-                cams.append(cam)
-
-            # cam_sum = F.relu(cam_sum)
-            # cam_sum = cam_sum.to(torch.float32)
-
-            # cams shape: [layers, 1, seq_len, seq_len]
-            cam_sum_lst = []
-
-            start_idx = last + 1
-            for i in range(start_idx, cams[0].shape[1]):
-                cam_sum = None
-                for layer, cam_l in enumerate(cams):
-                    cam_l_i = cam_l[0, i, :]  # shape: [1: seq_len]
-
-                    cam_l_i = cam_l_i[image_mask].unsqueeze(0)  # shape: [1, img_seq_len]
-                    # print(f"layer: {layer}, token index: {i}")
-                    # print("cam_sum shape: ", cam_l_i.shape)
-                    num_patches = cam_l_i.shape[-1]  # Last dimension of CAM output
-                    grid_size = int(num_patches ** 0.5)
-                    # print(f"Detected grid size: {grid_size}x{grid_size}")
-
-                    # Fix the reshaping step dynamically
-                    cam_reshaped = cam_l_i.view(grid_size, grid_size)
-                    # print(f"max: {cam_reshaped.max()}, min: {cam_reshaped.min()}")
-                    # cam_reshaped = (cam_reshaped - cam_reshaped.min()) / (cam_reshaped.max() - cam_reshaped.min())
-                    if cam_sum == None:
-                        cam_sum = cam_reshaped
-                    else:
-                        cam_sum += cam_reshaped
-                    # print(f"normalized: max: {cam_normalized.max()}, min: {cam_normalized.min()}")
-
-                # print(f"sum: max: {cam_sum.max()}, min: {cam_sum.min()}")
-                cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
-                cam_sum_lst.append(cam_sum)
-
-
-            return cam_sum_lst, grid_size, start_idx
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def generate_gradcam(
-    cam,
-    image,
-    size=(384, 384),
-    alpha=0.5,
-    colormap=cv2.COLORMAP_JET,
-    aggregation='mean',
-    normalize=False
-):
-    """
-    Generates a Grad-CAM heatmap overlay on top of the input image.
-
-    Parameters:
-        attributions (torch.Tensor): A tensor of shape (C, H, W) representing the
-            intermediate activations or gradients at the target layer.
-        image (PIL.Image): The original image.
-        alpha (float): The blending factor for the heatmap overlay (default 0.5).
-        colormap (int): OpenCV colormap to apply (default cv2.COLORMAP_JET).
-        aggregation (str): How to aggregate across channels; either 'mean' or 'sum'.
-
-    Returns:
-        PIL.Image: The image overlaid with the Grad-CAM heatmap.
-    """
-    # print("Generating Grad-CAM with shape:", cam.shape)
-
-    if normalize:
-        cam_min, cam_max = cam.min(), cam.max()
-        cam = cam - cam_min
-        cam = cam / (cam_max - cam_min)
-    # Convert tensor to numpy array
-    cam = torch.nn.functional.interpolate(cam.unsqueeze(0).unsqueeze(0), size=size, mode='bilinear').squeeze()
-    cam_np = cam.squeeze().detach().cpu().numpy()
-
-    # Apply Gaussian blur for smoother heatmaps
-    cam_np = cv2.GaussianBlur(cam_np, (5,5), sigmaX=0.8)
-
-    # Resize the cam to match the image size
-    width, height = size
-    cam_resized = cv2.resize(cam_np, (width, height))
-
-    # Convert the normalized map to a heatmap (0-255 uint8)
-    heatmap = np.uint8(255 * cam_resized)
-    heatmap = cv2.applyColorMap(heatmap, colormap)
-    # OpenCV produces heatmaps in BGR, so convert to RGB for consistency
-    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
-
-    # Convert original image to a numpy array
-    image_np = np.array(image)
-    image_np = cv2.resize(image_np, (width, height))
-
-    # Blend the heatmap with the original image
-    overlay = cv2.addWeighted(image_np, 1 - alpha, heatmap, alpha, 0)
-
-    return Image.fromarray(overlay)
-
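The deleted demo/cam.py held the attention-guided Grad-CAM implementations for CLIP, Janus, LLaVA, and ChartGemma; the commit message and the demo/visualization.py change below suggest this logic now lives in the Visualization class. For reference, here is a minimal sketch of the core aggregation the removed language-model branches performed: weight each layer's attention map by the ReLU of its gradient, average over heads, sum across layers, then min-max normalize. The function name is illustrative and the shapes are assumptions, not code from this repository:

import torch
import torch.nn.functional as F

def aggregate_attention_gradcam(attn_maps, attn_grads):
    """Sketch of the per-layer CAM aggregation from the removed cam.py.

    attn_maps / attn_grads: lists of tensors shaped [1, heads, seq_len, seq_len],
    one pair per hooked attention layer (assumed shapes, as in the deleted code).
    """
    cam_sum = None
    for act, grad in zip(attn_maps, attn_grads):
        # Average over heads, gate by positive gradients, accumulate across layers.
        cam = act.mean(dim=1) * F.relu(grad.mean(dim=1))   # [1, seq_len, seq_len]
        cam_sum = cam if cam_sum is None else cam_sum + cam
    cam_sum = F.relu(cam_sum).to(torch.float32)
    # Min-max normalize; epsilon guards against a constant map.
    cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min() + 1e-8)
    return cam_sum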
demo/visualization.py CHANGED
@@ -145,7 +145,7 @@ class Visualization:
         return cams


-    def process(self, cam_sum, thresholding=True, remove_cls=
+    def process(self, cam_sum, thresholding=True, remove_cls=False, normalize=True):

         cam_sum = cam_sum.to(torch.float32)

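Only the process() signature is visible in this hunk: remove_cls now defaults to False (consistent with keeping all image-token positions elsewhere in this commit) and a normalize flag is exposed. A sketch of what such a post-processing step plausibly does, reconstructed from the deleted demo/cam.py rather than from the real method body; process_cam is a stand-in name and the exact steps are an assumption:

import torch

def process_cam(cam_sum, thresholding=True, remove_cls=False, normalize=True):
    """Illustrative CAM post-processing: threshold, optionally drop CLS,
    reshape to a square patch grid, then min-max normalize."""
    cam_sum = cam_sum.to(torch.float32)
    if thresholding:
        # Zero out weak activations below the 20th percentile (as in cam.py).
        cam_sum[cam_sum < torch.quantile(cam_sum, 0.2)] = 0
    if remove_cls:
        cam_sum = cam_sum[..., 1:]              # drop the CLS position
    grid = int(cam_sum.shape[-1] ** 0.5)        # e.g. 576 patches -> 24x24
    cam_sum = cam_sum.reshape(grid, grid)
    if normalize:
        cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min() + 1e-8)
    return cam_sum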
janus/models/modeling_vlm.py CHANGED
@@ -256,7 +256,7 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

         # replace with the image embeddings
-        images_embeds = images_embeds[:, 1:, :]
+        # images_embeds = images_embeds[:, 1:, :]
         inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]

         return inputs_embeds
@@ -293,7 +293,8 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         inputs_embeds = self.language_model.get_input_embeddings()(input_tensor.input_ids)
         # print("input_embeddings: ", inputs_embeds)

-        images_embeds_rest = images_embeds[:, 1:, :]
+        # images_embeds_rest = images_embeds[:, 1:, :]
+        images_embeds_rest = images_embeds[:, :, :]

         # images_embeds_pooled = images_embeds.mean(dim=1)

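Both hunks stop slicing off the first image-embedding position, so every image embedding is scattered into the text sequence through the boolean masks. A toy, self-contained illustration of that masked assignment (all shapes below are made up for the example, not taken from the model config):

import torch

# Toy version of the masked write used in prepare_inputs_embeds: image
# embeddings are written into the positions occupied by <image> placeholder tokens.
batch, seq_len, n_img_tokens, dim = 1, 8, 3, 4
inputs_embeds = torch.zeros(batch, seq_len, dim)        # text-token embeddings
images_embeds = torch.randn(batch, n_img_tokens, dim)   # vision-encoder output

images_seq_mask = torch.zeros(batch, seq_len, dtype=torch.bool)
images_seq_mask[0, 2:5] = True                          # where <image> tokens sit
images_emb_mask = torch.ones(batch, n_img_tokens, dtype=torch.bool)

# With this commit all image positions are kept (no [:, 1:, :] slice), so the
# number of True entries on both sides must match exactly for the write to work.
inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]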
janus/models/siglip_vit.py CHANGED
@@ -655,9 +655,9 @@ def create_siglip_vit(
     else:
         layers = min(vision_cfg.layers, select_layer)

-    #
-    vision_cfg.class_token = True
-    print("Usage Class Token: ", vision_cfg.class_token)
+    # Require CLS token
+    # vision_cfg.class_token = True
+    # print("Usage Class Token: ", vision_cfg.class_token)

     model = VisionTransformer(
         img_size=image_size,
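create_siglip_vit no longer forces class_token = True, so the vision tower keeps its configured behaviour. The practical effect is the token count downstream CAM code sees: with a CLS token the encoder emits 1 + N patch tokens and the map must be sliced ([:, 1:]); without it, exactly N, which matches the remove_cls=False default above. A small illustrative calculation, assuming the 384-pixel, 16-pixel-patch SigLIP configuration this repo appears to use:

# Patch-token count for a SigLIP ViT at 384x384 with 16x16 patches (illustrative).
image_size, patch_size = 384, 16
num_patches = (image_size // patch_size) ** 2   # 24 * 24 = 576
tokens_with_cls = num_patches + 1               # 577 when class_token=True
tokens_without_cls = num_patches                # 576 when class_token stays False
print(num_patches, tokens_with_cls, tokens_without_cls)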
questions/VLAT.py CHANGED
@@ -49,7 +49,7 @@ VLAT_questions=[

    [
        "StackedArea",
-        "The number of girls named 'Olivia' was
+        "The number of girls named 'Olivia' was increasing or decreasing from 2009 to 2012?",
        "images/mini-VLAT/StackedArea.png"
    ],

@@ -115,7 +115,7 @@ VLAT_questions=[

    [
        "LineChart",
-        "Over the course of the first quarter of 2020, the price of a barrel of oil was
+        "Over the course of the first quarter of 2020, the price of a barrel of oil was increasing or decreasing?",
        "images/mini-VLAT/LineChart.png"
    ],

@@ -175,7 +175,7 @@ VLAT_questions=[

    [
        "AreaChart",
-        "Over the first six months of 2018, the price of a pound of coffee beans was roughly
+        "Over the first six months of 2018, the price of a pound of coffee beans was roughly decreasing or increasing?",
        "images/mini-VLAT/AreaChart.png"
    ],
