AustingDong committed
Commit · a594e78 · 1 Parent(s): 8bfad75

extend
Files changed:
- app.py (+17, -12)
- demo/cam.py (+5, -4)
- demo/model_utils.py (+24, -14)
app.py CHANGED

@@ -110,12 +110,12 @@ def multimodal_understanding(model_type,
 
     input_ids = prepare_inputs.input_ids[0].cpu().tolist()
    input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-    if model_name.split('-')[0] == "Janus":
-        start = 620
-    elif model_name.split('-')[0] == "ChartGemma":
-        start = 1024
-    elif model_name.split('-')[0] == "LLaVA":
-        start = 581
+    # if model_name.split('-')[0] == "Janus":
+    #     start = 620
+    # elif model_name.split('-')[0] == "ChartGemma":
+    #     start = 1024
+    # elif model_name.split('-')[0] == "LLaVA":
+    #     start = 581
 
     if activation_map_method == "GradCAM":
         # target_layers = vl_gpt.vision_model.vision_tower.blocks
@@ -136,7 +136,11 @@ def multimodal_understanding(model_type,
         elif model_name.split('-')[0] == "ChartGemma":
             gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
 
-        cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+        start = 0
+        if focus == "Visual Encoder":
+            cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+        else:
+            cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
         gradcam.remove_hooks()
 
 
@@ -207,14 +211,15 @@ def model_slider_change(model_type):
         clean()
         set_seed()
         model_utils = LLaVA_Utils()
-        vl_gpt, tokenizer = model_utils.init_LLaVA()
-        language_model_max_layer = 32
+        version = model_type.split('-')[1]
+        vl_gpt, tokenizer = model_utils.init_LLaVA(version=version)
+        language_model_max_layer = 32 if version == "1.5" else 28
         language_model_best_layer = 10
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type"),
-            gr.Slider(minimum=1, maximum=
-            gr.Slider(minimum=1, maximum=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
@@ -286,7 +291,7 @@ with gr.Blocks() as demo:
             activation_map_output = gr.Gallery(label="activation Map", height=300, columns=1)
 
         with gr.Column():
-            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B", "LLaVA-onevision-qwen2-7b-si"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
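The app.py hunks above retire the hard-coded per-model `start` offsets: when the focus is the language model, `generate_cam` now reports where the generated tokens begin. Below is a minimal sketch, not code from this commit, of how the returned `start` and `cam_tensors` could be paired with `input_ids_decoded` for display; the helper name `label_cam_outputs` and the pairing rule are assumptions.

```python
# Hypothetical helper (not in this repo): pair each CAM map with the token
# position it was generated for, using the `start` index returned by
# generate_cam() when focus != "Visual Encoder".
def label_cam_outputs(cam_tensors, input_ids_decoded, start):
    labeled = []
    for i, cam in enumerate(cam_tensors):
        pos = start + i  # absolute position in the full token sequence
        token = input_ids_decoded[pos] if pos < len(input_ids_decoded) else f"<generated #{i}>"
        labeled.append((token, cam))
    return labeled
```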
demo/cam.py CHANGED

@@ -274,7 +274,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
         # cam_sum shape: [1, seq_len, seq_len]
         cam_sum_lst = []
         cam_sum_raw = cam_sum
-        for i in range(620, cam_sum_raw.shape[1]):
+        start = 620
+        for i in range(start, cam_sum_raw.shape[1]):
             cam_sum = cam_sum_raw[:, i, :] # shape: [1: seq_len]
             cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0) # shape: [1, 576]
             print("cam_sum shape: ", cam_sum.shape)
@@ -290,7 +291,7 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start
 
     # Aggregate activations and gradients from ALL layers
 
@@ -407,7 +408,7 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
 
 
 
@@ -556,7 +557,7 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
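All three `generate_cam` implementations now return a third value (the start index of the response tokens), so any caller still unpacking a 2-tuple will raise a ValueError. A small compatibility sketch, assuming a wrapper name of my own (`call_generate_cam`) rather than anything defined in demo/cam.py:

```python
# Hypothetical wrapper: accept both the old (cams, grid_size) return shape and
# the new (cams, grid_size, start) shape of generate_cam().
def call_generate_cam(gradcam, *args, default_start=0):
    out = gradcam.generate_cam(*args)
    if len(out) == 3:
        cam_tensors, grid_size, start = out
    else:  # pre-change signature
        cam_tensors, grid_size = out
        start = default_start
    return cam_tensors, grid_size, start
```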
demo/model_utils.py CHANGED

@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaOnevisionForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 
@@ -117,19 +117,29 @@ class LLaVA_Utils(Model_Utils):
     def __init__(self):
         super().__init__()
 
-    def init_LLaVA(self):
-        model_path = "llava-hf/llava-1.5-7b-hf"
-        config = AutoConfig.from_pretrained(model_path)
-
-        self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
-                                                                    low_cpu_mem_usage=True,
-                                                                    attn_implementation = 'eager',
-                                                                    output_attentions=True
-                                                                    )
-        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
-        self.processor = AutoProcessor.from_pretrained(model_path)
-        self.tokenizer = self.processor.tokenizer
-
+    def init_LLaVA(self, version):
+        if version == "1.5":
+            model_path = "llava-hf/llava-1.5-7b-hf"
+            config = AutoConfig.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
+                                                                        low_cpu_mem_usage=True,
+                                                                        attn_implementation = 'eager',
+                                                                        output_attentions=True
+                                                                        )
+            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+            self.processor = AutoProcessor.from_pretrained(model_path)
+            self.tokenizer = self.processor.tokenizer
+
+        else:
+            model_path = "llava-hf/llava-onevision-qwen2-7b-si-hf"
+
+            self.processor = AutoProcessor.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaOnevisionForConditionalGeneration.from_pretrained(model_path,
+                                                                                 torch_dtype=torch.float16,
+                                                                                 low_cpu_mem_usage=True)
+            self.tokenizer = self.processor.tokenizer
 
         return self.vl_gpt, self.tokenizer
 
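`init_LLaVA` now takes a `version` argument and branches between the LLaVA-1.5 and LLaVA-OneVision checkpoints, matching the `model_type.split('-')[1]` dispatch added in app.py. The snippet below is purely illustrative (the loop and prints are not part of the commit) and traces which branch each dropdown string from this diff selects; the slider bound `32 if version == "1.5" else 28` lines up with the 32 decoder layers of Vicuna-7B and the 28 of Qwen2-7B.

```python
# Illustrative only: trace the version dispatch used by model_slider_change()
# for the two LLaVA entries in the model dropdown.
for model_type in ["LLaVA-1.5-7B", "LLaVA-onevision-qwen2-7b-si"]:
    version = model_type.split('-')[1]  # "1.5" or "onevision"
    if version == "1.5":
        checkpoint = "llava-hf/llava-1.5-7b-hf"                  # LlavaForConditionalGeneration
        max_layer = 32
    else:
        checkpoint = "llava-hf/llava-onevision-qwen2-7b-si-hf"   # LlavaOnevisionForConditionalGeneration
        max_layer = 28
    print(f"{model_type} -> version={version!r}, checkpoint={checkpoint}, max layer slider={max_layer}")
```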