AustingDong committed
Commit · b1faf64
Parent(s): 1ca9e3b

add ChartGemma

Files changed:
- app.py (+47 / -12)
- demo/cam.py (+188 / -1)
- demo/model_utils.py (+56 / -1)
app.py CHANGED

@@ -3,8 +3,8 @@ import torch
 from transformers import AutoConfig, AutoModelForCausalLM
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
-from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMLLaVA
-from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, add_title_to_image
+from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMChartGemma, AttentionGuidedCAMLLaVA
+from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, ChartGemma_Utils, add_title_to_image
 
 import numpy as np
 import matplotlib.pyplot as plt

@@ -22,7 +22,8 @@ clip_utils = Clip_Utils()
 clip_utils.init_Clip()
 model_utils, vl_gpt, tokenizer = None, None, None
 model_name = "Clip"
-
+language_model_max_layer = 24
+language_model_best_layer = 8
 
 def clean():
     global model_utils, vl_gpt, tokenizer, clip_utils

@@ -109,7 +110,12 @@ def multimodal_understanding(model_type,
 
     input_ids = prepare_inputs.input_ids[0].cpu().tolist()
     input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-
+    if model_name.split('-')[0] == "Janus":
+        start = 620
+    elif model_name.split('-')[0] == "ChartGemma":
+        start = 1024
+    else:
+        start = 512
 
     if saliency_map_method == "GradCAM":
         # target_layers = vl_gpt.vision_model.vision_tower.blocks

@@ -127,8 +133,13 @@ def multimodal_understanding(model_type,
            gradcam = AttentionGuidedCAMJanus(vl_gpt, target_layers)
        elif model_name.split('-')[0] == "LLaVA":
            gradcam = AttentionGuidedCAMLLaVA(vl_gpt, target_layers)
+       elif model_name.split('-')[0] == "ChartGemma":
+           gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
+
        cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
        gradcam.remove_hooks()
+
+
        if focus == "Visual Encoder":
            cam_grid = cam_tensors.reshape(grid_size, grid_size)
            cam = [generate_gradcam(cam_grid, image)]

@@ -144,7 +155,7 @@ def multimodal_understanding(model_type,
        else:
            cam = []
            for i, cam_tensor in enumerate(cam_tensors):
-               cam_grid = cam_tensor.reshape(
+               cam_grid = cam_tensor.reshape(grid_size, grid_size)
                cam_i = generate_gradcam(cam_grid, image)
                cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
 

@@ -158,7 +169,7 @@ def multimodal_understanding(model_type,
 # Gradio interface
 
 def model_slider_change(model_type):
-    global model_utils, vl_gpt, tokenizer, clip_utils, model_name
+    global model_utils, vl_gpt, tokenizer, clip_utils, model_name, language_model_max_layer, language_model_best_layer
     model_name = model_type
     if model_type == "Clip":
        clean()

@@ -179,6 +190,8 @@ def model_slider_change(model_type):
        set_seed()
        model_utils = Janus_Utils()
        vl_gpt, tokenizer = model_utils.init_Janus(model_type.split('-')[-1])
+       language_model_max_layer = 24
+       language_model_best_layer = 8
 
        res = (
            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),

@@ -195,6 +208,8 @@ def model_slider_change(model_type):
        set_seed()
        model_utils = LLaVA_Utils()
        vl_gpt, tokenizer = model_utils.init_LLaVA()
+       language_model_max_layer = 24
+       language_model_best_layer = 8
 
        res = (
            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),

@@ -204,9 +219,29 @@ def model_slider_change(model_type):
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
        )
        return res
+
+   elif model_type.split('-')[0] == "ChartGemma":
+       clean()
+       set_seed()
+       model_utils = ChartGemma_Utils()
+       vl_gpt, tokenizer = model_utils.init_ChartGemma()
+       language_model_max_layer = 18
+       language_model_best_layer = 12
+
+       res = (
+           gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
+           gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers min"),
+           gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers max"),
+           gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
+           gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+       )
+       return res
+
+
+
 
 def focus_change(focus):
-    global model_name
+    global model_name, language_model_max_layer
     if model_name == "Clip":
        res = (
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),

@@ -219,15 +254,15 @@ def focus_change(focus):
        if response_type.value == "answer + visualization":
            res = (
                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-               gr.Slider(minimum=1, maximum=
-               gr.Slider(minimum=1, maximum=
+               gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+               gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
            )
            return res
        else:
            res = (
                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-               gr.Slider(minimum=1, maximum=
-               gr.Slider(minimum=1, maximum=
+               gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+               gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
            )
            return res
 

@@ -251,7 +286,7 @@ with gr.Blocks() as demo:
            saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)
 
        with gr.Column():
-           model_selector = gr.Dropdown(choices=["Clip", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+           model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-2B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
            response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
            focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
            saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
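The two new globals, language_model_max_layer and language_model_best_layer, determine how the visualization-layer sliders are rebuilt when the selected model changes: Janus and LLaVA expose 24 language-model layers with layer 8 as the default, while ChartGemma-2B exposes 18 with layer 12 as the default. Below is a minimal, self-contained sketch of that pattern; the LAYER_RANGES table and layer_sliders helper are illustrative names, not code from this commit.

import gradio as gr

# Per-model-family slider ranges, taken from the values in the diff above.
LAYER_RANGES = {
    "ChartGemma": (18, 12),  # 18 language-model layers, default layer 12
    "Janus": (24, 8),
    "LLaVA": (24, 8),
}

def layer_sliders(model_type):
    # model_type looks like "ChartGemma-2B" or "Janus-1B"; the family is the prefix.
    family = model_type.split("-")[0]
    max_layer, best_layer = LAYER_RANGES.get(family, (24, 8))
    return (
        gr.Slider(minimum=1, maximum=max_layer, value=best_layer, step=1,
                  label="visualization layers min"),
        gr.Slider(minimum=1, maximum=max_layer, value=best_layer, step=1,
                  label="visualization layers max"),
    )

app.py implements the same idea with module-level globals rather than a lookup table, so focus_change can reuse whatever range the last model selection installed.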
demo/cam.py CHANGED

@@ -229,8 +229,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
 
 
        elif focus == "Language Model":
-           loss = self.target_layers[-1].attention_map.sum()
            self.model.zero_grad()
+           loss = outputs.logits.max(dim=-1).values.sum()
            loss.backward()
 
            self.activations = [layer.get_attn_map() for layer in self.target_layers]

@@ -429,6 +429,193 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
+    def __init__(self, model, target_layers):
+        self.target_layers = target_layers
+        super().__init__(model)
+        self._modify_layers()
+        self._register_hooks_activations()
+
+    def _modify_layers(self):
+        for layer in self.target_layers:
+            setattr(layer, "attn_gradients", None)
+            setattr(layer, "attention_map", None)
+
+            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
+            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
+            layer.save_attn_map = types.MethodType(save_attn_map, layer)
+            layer.get_attn_map = types.MethodType(get_attn_map, layer)
+
+    def _forward_activate_hooks(self, module, input, output):
+        attn_output, attn_weights = output  # Unpack outputs
+        print("attn_output shape:", attn_output.shape)
+        print("attn_weights shape:", attn_weights.shape)
+        module.save_attn_map(attn_weights)
+        attn_weights.register_hook(module.save_attn_gradients)
+
+    def _register_hooks_activations(self):
+        for layer in self.target_layers:
+            if hasattr(layer, "q_proj"):  # is an attention layer
+                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
+
+    @spaces.GPU(duration=120)
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
+        """ Generates Grad-CAM heatmap for ViT. """
+
+        # Forward pass
+        outputs_raw = self.model(**inputs)
+
+        image_embeddings = outputs_raw.image_hidden_states
+        inputs_embeddings = self.model.get_input_embeddings()(inputs['input_ids'])
+
+        # Pooling
+        image_embeddings_pooled = image_embeddings.mean(dim=1)
+
+        inputs_embeddings_pooled = inputs_embeddings.mean(dim=1)  # end of image: 618
+        # inputs_embeddings_pooled = inputs_embeddings[
+        #     torch.arange(inputs_embeddings.shape[0], device=inputs_embeddings.device),
+        #     input_ids.to(dtype=torch.int, device=inputs_embeddings.device).argmax(dim=-1),
+        # ]
+
+
+        # Backpropagate to get gradients
+        # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
+        # similarity = F.cosine_similarity(image_embeddings_mean, inputs_embeddings_mean, dim=-1)
+        # similarity.backward()
+        self.model.zero_grad()
+        print(outputs_raw)
+        # loss = self.target_layers[-1].attention_map.sum()
+        loss = outputs_raw.logits.max(dim=-1).values.sum()
+        loss.backward()
+
+        # get image masks
+        image_mask = []
+        last = 0
+        for i in range(inputs["input_ids"].shape[1]):
+            decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
+            print(decoded_token)
+            if (decoded_token == "<image>"):
+                image_mask.append(True)
+                last = i
+            else:
+                image_mask.append(False)
+
+
+        # Aggregate activations and gradients from ALL layers
+        self.activations = [layer.get_attn_map() for layer in self.target_layers]
+        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+        cam_sum = None
+        # Ver 1
+        # for act, grad in zip(self.activations, self.gradients):
+        #     # act = torch.sigmoid(act)
+        #     print("act:", act)
+        #     print(len(act))
+        #     print("act_shape:", act.shape)
+        #     # print("act1_shape:", act[1].shape)
+
+        #     act = F.relu(act.mean(dim=1))
+
+
+        #     # Compute mean of gradients
+        #     print("grad:", grad)
+        #     print(len(grad))
+        #     print("grad_shape:", grad.shape)
+        #     grad_weights = grad.mean(dim=1)
+
+        #     print("act shape", act.shape)
+        #     print("grad_weights shape", grad_weights.shape)
+
+        #     cam = act * grad_weights
+        #     # cam = act
+        #     print(cam.shape)
+
+        #     # Sum across all layers
+        #     if cam_sum is None:
+        #         cam_sum = cam
+        #     else:
+        #         cam_sum += cam
+
+        # Ver 2
+        for act, grad in zip(self.activations, self.gradients):
+
+            print("act shape", act.shape)
+            print("grad shape", grad.shape)
+
+            act = F.relu(act)
+            grad = F.relu(grad)
+
+
+            cam = act * grad  # shape: [1, heads, seq_len, seq_len]
+            cam = cam.sum(dim=1)  # shape: [1, seq_len, seq_len]
+
+            # Sum across all layers
+            if cam_sum is None:
+                cam_sum = cam
+            else:
+                cam_sum += cam
+
+        cam_sum = F.relu(cam_sum)
+        cam_sum = cam_sum.to(torch.float32)
+
+        # thresholding
+        # percentile = torch.quantile(cam_sum, 0.4)  # Adjust threshold dynamically
+        # cam_sum[cam_sum < percentile] = 0
+
+        # Reshape
+        # if visual_pooling_method == "CLS":
+        #     cam_sum = cam_sum[0, 1:]
+
+        # cam_sum shape: [1, seq_len, seq_len]
+        cam_sum_lst = []
+        cam_sum_raw = cam_sum
+        start_idx = 1024
+        for i in range(start_idx, cam_sum_raw.shape[1]):
+            cam_sum = cam_sum_raw[0, i, :]  # shape: [1: seq_len]
+            # cam_sum_min = cam_sum.min()
+            # cam_sum_max = cam_sum.max()
+            # cam_sum = (cam_sum - cam_sum_min) / (cam_sum_max - cam_sum_min)
+            cam_sum = cam_sum[image_mask].unsqueeze(0)  # shape: [1, 1024]
+            print("cam_sum shape: ", cam_sum.shape)
+            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
+            grid_size = int(num_patches ** 0.5)
+            print(f"Detected grid size: {grid_size}x{grid_size}")
+
+            # Fix the reshaping step dynamically
+
+            cam_sum = cam_sum.view(grid_size, grid_size)
+            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+            cam_sum_lst.append(cam_sum)
+
+
+        return cam_sum_lst, grid_size
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 def generate_gradcam(
     cam,
     image,
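The core of the new AttentionGuidedCAMChartGemma.generate_cam is the "Ver 2" aggregation: each hooked attention layer contributes ReLU(attention map) times ReLU(gradient of the loss with respect to that map), summed over heads and accumulated across layers; then, for every position after the image tokens, the corresponding row is restricted to the <image> columns and reshaped into a patch grid. The following is a small, self-contained sketch of that arithmetic on dummy tensors (random data, two fake layers, 1024 image tokens followed by 16 text tokens; no model involved):

import torch
import torch.nn.functional as F

# Dummy shapes standing in for ChartGemma: 1024 <image> tokens followed by text.
batch, heads, seq_len, n_image = 1, 8, 1040, 1024
image_mask = torch.tensor([True] * n_image + [False] * (seq_len - n_image))

# One (attention map, gradient) pair per hooked layer, as the forward hooks would save them.
layers = [(torch.rand(batch, heads, seq_len, seq_len),
           torch.randn(batch, heads, seq_len, seq_len)) for _ in range(2)]

cam_sum = None
for act, grad in layers:
    cam = F.relu(act) * F.relu(grad)   # [1, heads, seq_len, seq_len]
    cam = cam.sum(dim=1)               # sum over heads -> [1, seq_len, seq_len]
    cam_sum = cam if cam_sum is None else cam_sum + cam

cam_sum = F.relu(cam_sum).to(torch.float32)

# One heatmap per position after the image tokens: take that row, keep the
# <image> columns, normalize to [0, 1], and reshape to a square patch grid.
heatmaps = []
for i in range(n_image, seq_len):
    row = cam_sum[0, i, :][image_mask]        # [1024]
    grid_size = int(row.numel() ** 0.5)       # 32
    grid = row.view(grid_size, grid_size)
    grid = (grid - grid.min()) / (grid.max() - grid.min())
    heatmaps.append(grid)

print(len(heatmaps), heatmaps[0].shape)       # 16 heatmaps of shape (32, 32)

In generate_cam itself, the backpropagated loss is the sum of per-position maximum logits, and the row index starts at the hard-coded start_idx = 1024, matching the 1024 <image> placeholder tokens the processor inserts (a 32x32 patch grid).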
demo/model_utils.py CHANGED

@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 

@@ -170,6 +170,61 @@ class LLaVA_Utils(Model_Utils):
        )
 
        return outputs
+
+
+
+
+
+class ChartGemma_Utils(Model_Utils):
+    def __init__(self):
+        super().__init__()
+
+    def init_ChartGemma(self):
+
+        model_path = "ahmed-masry/chartgemma"
+
+
+        self.vl_gpt = PaliGemmaForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            attn_implementation="eager",
+            output_attentions=True
+        )
+        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+        self.processor = AutoProcessor.from_pretrained(model_path)
+        self.tokenizer = self.processor.tokenizer
+
+        return self.vl_gpt, self.tokenizer
+
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self, question, image):
+
+        pil_image = Image.fromarray(image)
+        prepare_inputs = self.processor(
+            images=pil_image, text=[question], return_tensors="pt"
+        ).to(self.cuda_device, dtype=self.dtype)
+
+        return prepare_inputs
+
+    @spaces.GPU(duration=120)
+    def generate_inputs_embeddings(self, prepare_inputs):
+        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+
+    @spaces.GPU(duration=120)
+    def generate_outputs(self, prepare_inputs, temperature, top_p):
+
+        outputs = self.vl_gpt.generate(
+            **prepare_inputs,
+            max_new_tokens=512,
+            do_sample=False if temperature == 0 else True,
+            use_cache=True,
+            return_dict_in_generate=True,
+            output_attentions=True
+        )
+
+        return outputs
+
+
 
 
 def add_title_to_image(image, title, font_size=20):
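Taken together, the three files wire ChartGemma into the demo as a fourth model choice: model_utils.py loads ahmed-masry/chartgemma through PaliGemmaForConditionalGeneration with eager attention, app.py instantiates AttentionGuidedCAMChartGemma when the ChartGemma family is selected, and cam.py turns the hooked attention maps into per-token heatmaps. The sketch below is a hypothetical, condensed version of that flow outside Gradio; the choice of target layers (the self-attention modules of the PaliGemma language model) is an assumption, since the target_layers line in app.py is not part of this diff, and the attribute path to those modules can differ across transformers versions.

import numpy as np
from demo.model_utils import ChartGemma_Utils
from demo.cam import AttentionGuidedCAMChartGemma, generate_gradcam

# Load ahmed-masry/chartgemma (eager attention so attention weights can be hooked).
model_utils = ChartGemma_Utils()
vl_gpt, tokenizer = model_utils.init_ChartGemma()

# Placeholder chart image; in the demo this comes from the Gradio image input.
image = np.random.randint(0, 255, (448, 448, 3), dtype=np.uint8)
prepare_inputs = model_utils.prepare_inputs("What is the highest value?", image)

# Assumption: hook the language model's self-attention modules
# (the exact attribute path depends on the transformers version).
target_layers = [blk.self_attn for blk in vl_gpt.language_model.model.layers]

gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
cam_tensors, grid_size = gradcam.generate_cam(
    prepare_inputs, tokenizer, temperature=0, top_p=0.95, focus="Language Model",
)
gradcam.remove_hooks()

# One heatmap per token position after the 1024 image tokens.
heatmaps = [generate_gradcam(cam.reshape(grid_size, grid_size), image) for cam in cam_tensors]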