ccclemenfff committed on
Commit
2e0bbea
·
1 Parent(s): af262cd

Add videollama2 model code

Browse files
Files changed (41) hide show
  1. videollama2/__init__.py +114 -0
  2. videollama2/constants.py +32 -0
  3. videollama2/conversation.py +507 -0
  4. videollama2/eval/eval_video_cap_msvc_correctness.py +259 -0
  5. videollama2/eval/eval_video_cap_msvc_detailedness.py +257 -0
  6. videollama2/eval/eval_video_mcqa_mvbench.py +64 -0
  7. videollama2/eval/eval_video_mcqa_videomme.py +277 -0
  8. videollama2/eval/eval_video_oqa_activitynet.py +210 -0
  9. videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py +210 -0
  10. videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py +210 -0
  11. videollama2/eval/eval_video_oqa_vcgpt_3_context.py +212 -0
  12. videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py +206 -0
  13. videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py +218 -0
  14. videollama2/eval/inference_video_cap_msvc.py +120 -0
  15. videollama2/eval/inference_video_mcqa_egoschema.py +153 -0
  16. videollama2/eval/inference_video_mcqa_mvbench.py +203 -0
  17. videollama2/eval/inference_video_mcqa_perception_test_mcqa.py +169 -0
  18. videollama2/eval/inference_video_mcqa_videomme.py +304 -0
  19. videollama2/eval/inference_video_oqa_activitynet.py +150 -0
  20. videollama2/eval/inference_video_oqa_vcgpt_consistency.py +150 -0
  21. videollama2/eval/inference_video_oqa_vcgpt_general.py +130 -0
  22. videollama2/mm_utils.py +345 -0
  23. videollama2/model/__init__.py +193 -0
  24. videollama2/model/encoder.py +164 -0
  25. videollama2/model/projector.py +250 -0
  26. videollama2/model/videollama2_arch.py +263 -0
  27. videollama2/model/videollama2_llama.py +155 -0
  28. videollama2/model/videollama2_mistral.py +157 -0
  29. videollama2/model/videollama2_mixtral.py +152 -0
  30. videollama2/model/videollama2_qwen2.py +151 -0
  31. videollama2/serve/cli.py +139 -0
  32. videollama2/serve/controller.py +298 -0
  33. videollama2/serve/gradio_web_server.py +499 -0
  34. videollama2/serve/gradio_web_server_adhoc.py +318 -0
  35. videollama2/serve/model_worker.py +397 -0
  36. videollama2/serve/register_worker.py +26 -0
  37. videollama2/serve/sglang_worker.py +244 -0
  38. videollama2/serve/test_message.py +62 -0
  39. videollama2/train.py +574 -0
  40. videollama2/utils.py +126 -0
  41. videollama2/videollama2_trainer.py +369 -0
videollama2/__init__.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ import warnings
4
+ import shutil
5
+ from functools import partial
6
+
7
+ import torch
8
+
9
+ from .model import load_pretrained_model
10
+ from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria
11
+ from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP
12
+
13
+
14
def model_init(model_path=None, **kwargs):
    """Load a VideoLLaMA2 checkpoint and build its per-modality preprocessors.

    Args:
        model_path: checkpoint path or hub id. ``None`` selects
            ``"DAMO-NLP-SG/VideoLLaMA2-7B"``.
        **kwargs: forwarded unchanged to ``load_pretrained_model``.

    Returns:
        tuple: ``(model, processor, tokenizer)``. ``processor`` is a dict whose
        ``'image'`` and ``'video'`` entries are ``functools.partial`` wrappers
        around ``process_image`` and ``process_video``.
    """
    if model_path is None:
        model_path = "DAMO-NLP-SG/VideoLLaMA2-7B"
    checkpoint_name = get_model_name_from_path(model_path)
    tokenizer, model, vision_processor, _context_len = load_pretrained_model(
        model_path, None, checkpoint_name, **kwargs
    )

    # Reuse the unknown token for padding when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None and tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token

    # Prefer the frame count stored in the model config; otherwise use the package default.
    frame_count = getattr(model.config, "num_frames", NUM_FRAMES)

    processors = {
        'image': partial(process_image, processor=vision_processor, aspect_ratio=None),
        'video': partial(process_video, processor=vision_processor, aspect_ratio=None, num_frames=frame_count),
    }

    return model, processors, tokenizer
30
+
31
+
32
def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
    """Run single-sample inference with a VideoLLaMA2 model.

    Args:
        image_or_video (torch.Tensor): image tensor (1, C, H, W) / video tensor
            (T, C, H, W). Ignored when ``modal == 'text'``.
        instruct (str | list): either the user instruction as a string, or a
            non-empty list of chat messages (dicts with ``'role'`` and
            ``'content'``). The modality token is prepended to the first
            message's content; the caller's list is not modified.
        model: VideoLLaMA2 model.
        tokenizer: tokenizer.
        modal (str): inference modality, one of ``'image'``, ``'video'``, ``'text'``.
        **kwargs: optional generation settings:
            do_sample (bool, default False), temperature (default 0.2 when
            sampling, else 0.0), top_p (default 0.9), max_new_tokens (default 2048).

    Returns:
        str: response of the model.

    Raises:
        ValueError: if ``modal`` is unsupported, ``instruct`` is neither a
            string nor a list, or ``instruct`` is an empty list.
    """

    # 1. pick the placeholder token for the requested modality.
    if modal == 'image':
        modal_token = DEFAULT_IMAGE_TOKEN
    elif modal == 'video':
        modal_token = DEFAULT_VIDEO_TOKEN
    elif modal == 'text':
        modal_token = ''
    else:
        raise ValueError(f"Unsupported modal: {modal}")

    # 2. vision preprocess (move the visual input to GPU in half precision).
    if modal == 'text':
        tensor = None
    else:
        tensor = image_or_video.half().cuda()
        tensor = [(tensor, modal)]

    # 3. text preprocess (tag process & generate prompt).
    if isinstance(instruct, str):
        message = [{'role': 'user', 'content': modal_token + '\n' + instruct}]
    elif isinstance(instruct, list):
        if not instruct:
            # Previously surfaced as a bare IndexError on message[0].
            raise ValueError("instruct must contain at least one message")
        # Deep copy so the caller's message list is left untouched.
        message = copy.deepcopy(instruct)
        message[0]['content'] = modal_token + '\n' + message[0]['content']
    else:
        raise ValueError(f"Unsupported type of instruct: {type(instruct)}")

    if model.config.model_type in ['videollama2', 'videollama2_mistral', 'videollama2_mixtral']:
        system_message = [
            {'role': 'system', 'content': (
            """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
            """\n"""
            """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
            }
        ]
    else:
        system_message = []

    message = system_message + message
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
    if tokenizer.pad_token_id is None:
        # A single unpadded prompt has nothing to mask; comparing against None would fail.
        attention_masks = torch.ones_like(input_ids)
    else:
        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()

    # 4. generate response according to visual signals and prompts.
    keywords = [tokenizer.eos_token]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    do_sample = kwargs.get('do_sample', False)
    temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
    top_p = kwargs.get('top_p', 0.9)
    max_new_tokens = kwargs.get('max_new_tokens', 2048)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_masks,
            images=tensor,
            do_sample=do_sample,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
            pad_token_id=tokenizer.eos_token_id,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    return outputs
videollama2/constants.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Serving heartbeat settings (presumably seconds; confirm against the controller/worker code).
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

# Directory used for log output.
LOGDIR = "."

# Model constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "<video>"
NUM_FRAMES = 8
MAX_FRAMES = 32
NUM_FRAMES_PER_SECOND = 1

# Audio arguments
AUDIO_TOKEN_INDEX = -202
DEFAULT_AUDIO_TOKEN = "<audio>"

# Placeholder token -> index, derived from the constants above so the two cannot drift apart.
MODAL_INDEX_MAP = {
    DEFAULT_IMAGE_TOKEN: IMAGE_TOKEN_INDEX,
    DEFAULT_VIDEO_TOKEN: VIDEO_TOKEN_INDEX,
    DEFAULT_AUDIO_TOKEN: AUDIO_TOKEN_INDEX,
}
videollama2/conversation.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+
7
+ from PIL import Image
8
+ from .constants import LOGDIR, NUM_FRAMES
9
+
10
+
11
+ class SeparatorStyle(Enum):
12
+ """Different separator style."""
13
+ SINGLE = auto()
14
+ TWO = auto()
15
+ PLAIN = auto()
16
+ LLAMA2 = auto()
17
+ QWEN = auto()
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ skip_next: bool = False
32
+ modality: str = "image"
33
+
34
+ def get_prompt(self):
35
+ messages = self.messages
36
+ modality_token = f"<{self.modality}>"
37
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
38
+ messages = self.messages.copy()
39
+ init_role, init_msg = messages[0].copy()
40
+ init_msg = init_msg[0].replace(modality_token, "").strip()
41
+ if 'mmtag' in self.version:
42
+ messages[0] = (init_role, init_msg)
43
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
44
+ messages.insert(1, (self.roles[1], "Received."))
45
+ else:
46
+ messages[0] = (init_role, f"{modality_token}\n" + init_msg)
47
+
48
+ if self.sep_style == SeparatorStyle.SINGLE:
49
+ ret = self.system + self.sep
50
+ for role, message in messages:
51
+ if message:
52
+ if type(message) is tuple:
53
+ message, _, _ = message
54
+ ret += role + ": " + message + self.sep
55
+ else:
56
+ ret += role + ":"
57
+ elif self.sep_style == SeparatorStyle.TWO:
58
+ seps = [self.sep, self.sep2]
59
+ ret = self.system + seps[0]
60
+ for i, (role, message) in enumerate(messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message, _, _ = message
64
+ ret += role + ": " + message + seps[i % 2]
65
+ else:
66
+ ret += role + ":"
67
+ elif self.sep_style == SeparatorStyle.LLAMA2:
68
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
69
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
70
+ ret = ""
71
+
72
+ for i, (role, message) in enumerate(messages):
73
+ if i == 0:
74
+ assert message, "first message should not be none"
75
+ assert role == self.roles[0], "first message should come from user"
76
+ if message:
77
+ if type(message) is tuple:
78
+ message, _, _ = message
79
+ if i == 0: message = wrap_sys(self.system) + message
80
+ if i % 2 == 0:
81
+ message = wrap_inst(message)
82
+ ret += self.sep + message
83
+ else:
84
+ ret += " " + message + " " + self.sep2
85
+ else:
86
+ ret += ""
87
+ ret = ret.lstrip(self.sep)
88
+ elif self.sep_style == SeparatorStyle.QWEN:
89
+ ret = ""
90
+ # 1. Add system prompt
91
+ ret += self.system + self.sep + "\n"
92
+ # 2. Iterate message
93
+ for i, (role, message) in enumerate(messages):
94
+ if i == 0:
95
+ assert message, "first message should not be none"
96
+ assert role == self.roles[0], "first message should come from user"
97
+ if message:
98
+ if type(message) is tuple:
99
+ message, _, _ = message
100
+ # 2.1 Add role and message
101
+ ret += role + message + self.sep + "\n"
102
+ else:
103
+ # 2.2 Add generation prompt
104
+ ret += role
105
+ elif self.sep_style == SeparatorStyle.PLAIN:
106
+ seps = [self.sep, self.sep2]
107
+ ret = self.system
108
+ for i, (role, message) in enumerate(messages):
109
+ if message:
110
+ if type(message) is tuple:
111
+ message, _, _ = message
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ else:
116
+ raise ValueError(f"Invalid style: {self.sep_style}")
117
+
118
+ return ret
119
+
120
+ def append_message(self, role, message):
121
+ self.messages.append([role, message])
122
+
123
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
124
+ if image_process_mode == "Pad":
125
+ def expand2square(pil_img, background_color=(122, 116, 104)):
126
+ width, height = pil_img.size
127
+ if width == height:
128
+ return pil_img
129
+ elif width > height:
130
+ result = Image.new(pil_img.mode, (width, width), background_color)
131
+ result.paste(pil_img, (0, (width - height) // 2))
132
+ return result
133
+ else:
134
+ result = Image.new(pil_img.mode, (height, height), background_color)
135
+ result.paste(pil_img, ((height - width) // 2, 0))
136
+ return result
137
+ image = expand2square(image)
138
+ elif image_process_mode in ["Default", "Crop"]:
139
+ pass
140
+ elif image_process_mode == "Resize":
141
+ image = image.resize((336, 336))
142
+ else:
143
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
144
+ if max(image.size) > max_len:
145
+ max_hw, min_hw = max(image.size), min(image.size)
146
+ aspect_ratio = max_hw / min_hw
147
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
148
+ longest_edge = int(shortest_edge * aspect_ratio)
149
+ W, H = image.size
150
+ if H > W:
151
+ H, W = longest_edge, shortest_edge
152
+ else:
153
+ H, W = shortest_edge, longest_edge
154
+ image = image.resize((W, H))
155
+ if return_pil:
156
+ return image
157
+ else:
158
+ buffered = BytesIO()
159
+ image.save(buffered, format=image_format)
160
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
161
+ return img_b64_str
162
+
163
+
164
+ def get_videos(self, return_pil=False):
165
+ video_frames = []
166
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
167
+ if i % 2 == 0:
168
+ if type(msg) is tuple:
169
+ from decord import VideoReader, cpu
170
+ import numpy as np
171
+ # here video is the file path of input video
172
+ msg, video, image_process_mode = msg
173
+ if not return_pil:
174
+ # return filepath
175
+ video_frames.append(video)
176
+ else:
177
+ # read video using decord.VideoReader
178
+ decord_vr = VideoReader(uri=video, ctx=cpu(0))
179
+ duration = len(decord_vr)
180
+ frame_id_list = np.linspace(0, duration-1, NUM_FRAMES, dtype=int)
181
+ # convert the extracted image frames into PIL objects
182
+ all_images = [Image.fromarray(f) for f in decord_vr.get_batch(frame_id_list).asnumpy()]
183
+ video_frames.extend([self.process_image(image, image_process_mode, return_pil=return_pil) for image in all_images])
184
+ return video_frames
185
+
186
+
187
+ def get_images(self, return_pil=False):
188
+ images = []
189
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
190
+ if i % 2 == 0:
191
+ if type(msg) is tuple:
192
+ msg, image, image_process_mode = msg
193
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
194
+ images.append(image)
195
+
196
+ # import base64
197
+ # from io import BytesIO
198
+ # from PIL import Image
199
+ # # here image is a PIL object
200
+ # msg, image, image_process_mode = msg
201
+ # if image_process_mode == "Pad":
202
+ # def expand2square(pil_img, background_color=(122, 116, 104)):
203
+ # width, height = pil_img.size
204
+ # if width == height:
205
+ # return pil_img
206
+ # elif width > height:
207
+ # result = Image.new(pil_img.mode, (width, width), background_color)
208
+ # result.paste(pil_img, (0, (width - height) // 2))
209
+ # return result
210
+ # else:
211
+ # result = Image.new(pil_img.mode, (height, height), background_color)
212
+ # result.paste(pil_img, ((height - width) // 2, 0))
213
+ # return result
214
+ # image = expand2square(image)
215
+ # elif image_process_mode in ["Default", "Crop"]:
216
+ # pass
217
+ # elif image_process_mode == "Resize":
218
+ # image = image.resize((336, 336))
219
+ # else:
220
+ # raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
221
+ # max_hw, min_hw = max(image.size), min(image.size)
222
+ # aspect_ratio = max_hw / min_hw
223
+ # max_len, min_len = 800, 400
224
+ # shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
225
+ # longest_edge = int(shortest_edge * aspect_ratio)
226
+ # W, H = image.size
227
+ # if longest_edge != max(image.size):
228
+ # if H > W:
229
+ # H, W = longest_edge, shortest_edge
230
+ # else:
231
+ # H, W = shortest_edge, longest_edge
232
+ # image = image.resize((W, H))
233
+ # if return_pil:
234
+ # images.append(image)
235
+ # else:
236
+ # buffered = BytesIO()
237
+ # image.save(buffered, format="PNG")
238
+ # img_b64_str = base64.b64encode(buffered.getvalue()).decode()
239
+ # images.append(img_b64_str)
240
+ return images
241
+
242
+ def to_gradio_chatbot(self):
243
+ ret = []
244
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
245
+ if i % 2 == 0:
246
+ if type(msg) is tuple:
247
+ # import base64
248
+ # from io import BytesIO
249
+ # from PIL import Image
250
+ # msg, image, image_process_mode = msg
251
+ # max_hw, min_hw = max(image.size), min(image.size)
252
+ # aspect_ratio = max_hw / min_hw
253
+ # max_len, min_len = 800, 400
254
+ # shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
255
+ # longest_edge = int(shortest_edge * aspect_ratio)
256
+ # W, H = image.size
257
+ # if H > W:
258
+ # H, W = longest_edge, shortest_edge
259
+ # else:
260
+ # H, W = shortest_edge, longest_edge
261
+ # image = image.resize((W, H))
262
+ # buffered = BytesIO()
263
+ # image.save(buffered, format="JPEG")
264
+ # img_b64_str = base64.b64encode(buffered.getvalue()).decode()
265
+ # img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
266
+ # display image/video in the textbox
267
+ msg, image_or_video, image_process_mode = msg
268
+ ##print("imagebox:", image)
269
+ if isinstance(image_or_video, Image.Image):
270
+ # image is PIL object
271
+ img_b64_str = self.process_image(image_or_video, "Default", return_pil=False, image_format='JPEG')
272
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
273
+ msg = img_str + msg.replace('<image>', '').strip()
274
+ else:
275
+ # video is file path
276
+ vid_str = f'<video controls playsinline width="500" style="display: inline-block;" src="./file={image_or_video}"></video><br>'
277
+ msg = vid_str + msg.replace('<video>', '').strip()
278
+ ret.append([msg, None])
279
+ else:
280
+ ret.append([msg, None])
281
+ else:
282
+ ret[-1][-1] = msg
283
+ return ret
284
+
285
+ def copy(self):
286
+ return Conversation(
287
+ system=self.system,
288
+ roles=self.roles,
289
+ messages=[[x, y] for x, y in self.messages],
290
+ offset=self.offset,
291
+ sep_style=self.sep_style,
292
+ sep=self.sep,
293
+ sep2=self.sep2,
294
+ version=self.version)
295
+
296
+ def dict(self):
297
+ if (self.modality == "image" and len(self.get_images()) > 0) or \
298
+ (self.modality == "video" and len(self.get_videos()) > 0):
299
+ return {
300
+ "system": self.system,
301
+ "roles": self.roles,
302
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
303
+ "offset": self.offset,
304
+ "sep": self.sep,
305
+ "sep2": self.sep2,
306
+ "modality": self.modality
307
+ }
308
+ return {
309
+ "system": self.system,
310
+ "roles": self.roles,
311
+ "messages": self.messages,
312
+ "offset": self.offset,
313
+ "sep": self.sep,
314
+ "sep2": self.sep2,
315
+ }
316
+
317
+
318
# System prompts shared by several templates below.
_SYSTEM_HUMAN = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions."
)
_SYSTEM_USER = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
)
_SYSTEM_MMTAG = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "The visual content will be provided with the following format: <Image>visual content</Image>."
)
_SYSTEM_LLAMA2 = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."
    "\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
)

# Example exchange that seeds the v0 template (skipped by UI helpers via offset=2).
_V0_EXAMPLE_QUESTION = "What are the key differences between renewable and non-renewable energy sources?"
_V0_EXAMPLE_ANSWER = (
    "Renewable energy sources are those that can be replenished naturally in a relatively "
    "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
    "Non-renewable energy sources, on the other hand, are finite and will eventually be "
    "depleted, such as coal, oil, and natural gas. Here are some key differences between "
    "renewable and non-renewable energy sources:\n"
    "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
    "energy sources are finite and will eventually run out.\n"
    "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
    "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
    "and other negative effects.\n"
    "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
    "have lower operational costs than non-renewable sources.\n"
    "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
    "locations than non-renewable sources.\n"
    "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
    "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
    "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
    "non-renewable sources are not, and their depletion can lead to economic and social instability.\n"
)

_QWEN_ROLES = ("<|im_start|>user\n", "<|im_start|>assistant\n")

conv_vicuna_v0 = Conversation(
    system=_SYSTEM_HUMAN,
    roles=("Human", "Assistant"),
    messages=(
        ("Human", _V0_EXAMPLE_QUESTION),
        ("Assistant", _V0_EXAMPLE_ANSWER),
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="",
    sep2="\n",
)

conv_llava_v0_mmtag = Conversation(
    system=_SYSTEM_MMTAG,
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v0 = Conversation(
    system=_SYSTEM_HUMAN,
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system=_SYSTEM_USER,
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1",
)

conv_llava_v1_mmtag = Conversation(
    system=_SYSTEM_MMTAG,
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

conv_llava_v1 = Conversation(
    system=_SYSTEM_HUMAN,
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1",
)

conv_llava_llama2 = Conversation(
    system=(
        "You are a helpful language and vision assistant. "
        "You are able to understand the visual content that the user provides, "
        "and assist the user with a variety of tasks using natural language."
    ),
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="<s>",
    sep2="</s>",
    version="llama2",
)

conv_llama2 = Conversation(
    system=_SYSTEM_LLAMA2,
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="<s>",
    sep2="</s>",
    version="llama2",
)

conv_mistral = Conversation(
    system=_SYSTEM_USER,
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="",
    sep2="</s>",
    version="llama2",
)

conv_qwen = Conversation(
    system="<|im_start|>system\nYou are a helpful assistant.",
    roles=_QWEN_ROLES,
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.QWEN,
    sep="<|im_end|>",
    version="qwen",
)

conv_qwen_plain = Conversation(
    system="",
    roles=_QWEN_ROLES,
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="<|im_end|>",
    sep2="<|im_end|>",
    version="qwen_plain",
)

# Template used when the caller does not pick one by name.
default_conversation = conv_mistral

# Registry of templates by name.
conv_templates = {
    "default": conv_vicuna_v0,
    # pretrain template
    "plain": conv_llava_plain,
    # llava v0
    "v0": conv_vicuna_v0,
    "v0_plain": conv_llava_plain,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v0": conv_llava_v0,
    # llava v1
    "v1": conv_vicuna_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_v1": conv_llava_v1,
    "vicuna_v1": conv_vicuna_v1,
    # llava v1.5
    "llava_llama2": conv_llava_llama2,
    # llama2
    "llama2": conv_llama2,
    # mistral
    "mistral": conv_mistral,
    # qwen
    "qwen": conv_qwen,
    "qwen_plain": conv_qwen_plain,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
videollama2/eval/eval_video_cap_msvc_correctness.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import ast
4
+ import time
5
+ import json
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ import openai
11
+ from openai import AzureOpenAI
12
+
13
+
14
+ def init():
15
+ client = AzureOpenAI(
16
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
17
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
18
+ api_version="2024-02-15-preview"
19
+ )
20
+
21
+ return client
22
+
23
+
24
+ def interaction(client, message_text):
25
+ completion = client.chat.completions.create(
26
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
27
+ messages = message_text,
28
+ temperature=0.7,
29
+ max_tokens=800,
30
+ top_p=0.95,
31
+ frequency_penalty=0,
32
+ presence_penalty=0,
33
+ stop=None
34
+ )
35
+
36
+ return completion
37
+
38
+
39
+ def annotate(prediction_set, caption_files, output_dir):
40
+ """
41
+ Evaluates question and answer pairs using GPT-3
42
+ Returns a score for correctness.
43
+ """
44
+
45
+ for file in tqdm(caption_files):
46
+ key = file[:-5] # Strip file extension
47
+ qa_set = prediction_set[key]
48
+ question = qa_set['q']
49
+ answer = str(qa_set['a'])
50
+ pred = qa_set['pred']
51
+ try:
52
+ message = [
53
+ {
54
+ "role": "system",
55
+ "content":
56
+ "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
57
+ "Your task is to compare the predicted answer with these correct answers and determine if they are factually consistent. Here's how you can accomplish the task:"
58
+ "------"
59
+ "##INSTRUCTIONS: "
60
+ "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
61
+ "- The predicted answer must be factually accurate and align with the video content.\n"
62
+ "- Consider synonyms or paraphrases as valid matches.\n"
63
+ "- Evaluate the factual accuracy of the prediction compared to the answer."
64
+ },
65
+ {
66
+ "role": "user",
67
+ "content":
68
+ "Please evaluate the following video-based question-answer pair:\n\n"
69
+ f"Question: {question}\n"
70
+ f"Correct Answers: {answer}\n"
71
+ f"Predicted Answer: {pred}\n\n"
72
+ "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
73
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
74
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
75
+ "For example, your response should look like this: {''score': 4.8}."
76
+ }
77
+ ]
78
+ completion = interaction(client, message)
79
+ # Convert response to a Python dictionary.
80
+ response_message = completion.choices[0].message.content
81
+ response_dict = ast.literal_eval(response_message)
82
+ result_qa_pair = [response_dict, qa_set]
83
+ # # Save the question-answer pairs to a json file.
84
+ with open(f"{output_dir}/{key}.json", "w") as f:
85
+ json.dump(result_qa_pair, f)
86
+
87
+ except Exception as e:
88
+ print(f"Error processing file '{key}': {e}")
89
+
90
+ time.sleep(1)
91
+
92
+
93
+ def longest_repeating_substring(s):
94
+ n = len(s)
95
+ dp = [[0] * (n+1) for _ in range(n+1)]
96
+ res = ""
97
+ res_length = 0
98
+
99
+ index = 0
100
+ for i in range(1, n+1):
101
+ for j in range(i+1, n+1):
102
+ if (dp[i-1][j-1] > 0 and dp[i-1][j-1] < (j-i)) or s[i-1] == s[j-1]:
103
+ dp[i][j] = dp[i-1][j-1] + 1
104
+ if dp[i][j] > res_length:
105
+ res_length = dp[i][j]
106
+ index = max(i, index)
107
+ else:
108
+ dp[i][j] = 0
109
+
110
+ if res_length > 0:
111
+ for i in range(index-res_length+1, index+1):
112
+ res = res + s[i-1]
113
+
114
+ return res
115
+
116
+
117
def main(args):
    """Score predictions with the GPT judge and write the combined results.

    Args:
        args: Parsed CLI namespace. Reads ``pred_path``, ``num_chunks``,
            ``output_dir``, ``output_json`` and ``num_tasks``.

    Side effects:
        Creates ``args.output_dir`` if needed, calls ``annotate`` to write one
        ``<video>_<n>.json`` per sample into it, then writes every JSON file
        found in that directory plus an ``average_score`` entry to
        ``args.output_json`` and prints the average.
    """
    # Predictions are stored as JSON lines, optionally split into
    # "<num_chunks>_<idx>.json" shards inside the pred_path directory.
    if args.num_chunks > 1:
        pred_files = [
            os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
            for _idx in range(args.num_chunks)
        ]
    else:
        pred_files = [args.pred_path]

    pred_contents = []
    for pred_file in pred_files:
        with open(pred_file) as f:
            pred_contents += [json.loads(line) for line in f]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Give every sample a unique key: "<file stem>_<occurrence index>".
    for sample in pred_contents:
        video_id = sample["video_name"]
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # NOTE: the sample dict is renamed in place, not copied.
        new_sample = sample
        new_sample["video_name"] = f"{video_id.split('/')[-1].split('.')[0]}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x["video_name"] for x in new_pred_contents]
    caption_files = [f"{sample_id}.json" for sample_id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        prediction_set[sample["video_name"]] = {
            "q": sample["question"],
            "a": sample["answer"],
            "pred": sample["pred"],
        }

    # A non-positive split count would make the division below fail on every
    # pass and spin forever, so clamp it to at least one.
    num_tasks = max(1, args.num_tasks)

    # Loop until every caption has an output file. Samples whose annotation
    # keeps failing are retried indefinitely.
    while True:
        try:
            # Files that have already been processed.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i : i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
            print("Generate", len(all_parts), "subprocess.")

            # Only the first part is annotated per pass (the worker pool is
            # disabled); the remaining parts are picked up by later passes.
            annotate(*task_args[0])

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                try:
                    content = json.load(json_file)
                    combined_contents[file_name[:-5]] = content
                except Exception as e:
                    print(f"Error: {e}")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        try:
            # The judge's dict is expected to hold one entry; its first value
            # is taken as the score whatever the key is called.
            for score_key in result[0].keys():
                score_sum += int(result[0][score_key])
                break
        except Exception as e:
            # Report and continue; unparsable entries still count toward the
            # denominator and contribute 0 to the sum.
            print(f"Error processing file '{key}': {e}")

    # Guard against an empty result set instead of dividing by zero.
    average_score = score_sum / count if count else 0.0
    combined_contents["average_score"] = average_score
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file, indent=4)
    print("Average score for correctness:", average_score)
238
+
239
+
240
if __name__ == "__main__":
    # Command-line entry point: collect paths and Azure credentials, publish
    # the credentials through the environment, build the client, run main().
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")

    # Input and output locations.
    parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")

    # Work splitting.
    parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
    parser.add_argument("--num_chunks", default=1, type=int, help="Result splits")

    # Azure OpenAI credentials.
    parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
    parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
    parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")

    args = parser.parse_args()

    # init() and interaction() read their settings from these variables.
    os.environ.update(
        {
            "AZURE_OPENAI_KEY": args.api_key,
            "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
            "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
        }
    )

    # Module-level on purpose: annotate() looks up `client` as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_cap_msvc_detailedness.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import ast
4
+ import time
5
+ import json
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ import openai
11
+ from openai import AzureOpenAI
12
+
13
+
14
def init():
    """Create the Azure OpenAI client used for judging.

    The endpoint and key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables; the API version is fixed.

    Returns:
        A configured ``AzureOpenAI`` instance.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
22
+
23
+
24
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: The chat messages to send.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the environment; sampling settings are fixed.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
37
+
38
+
39
def annotate(prediction_set, caption_files, output_dir):
    """Ask the GPT judge for a detail-orientation score for each sample.

    For every file name in ``caption_files`` the matching question/answer/
    prediction triple is sent to the judge, and the parsed reply is written to
    ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with keys ``'q'``,
            ``'a'`` and ``'pred'``.
        caption_files: Output file names (``"<key>.json"``) to process.
        output_dir: Directory receiving the per-sample JSON files.

    Returns:
        None. Failures for a sample are printed and that sample is skipped, so
        no file is written for it.
    """
    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = str(qa_set['a'])
        pred = qa_set['pred']
        try:
            # NOTE(review): the example at the end of the user prompt
            # ({''score': 4.8}) is malformed and shows a float although an
            # integer is requested. The prompt text is left untouched here so
            # scores stay comparable with earlier runs.
            message = [
                {
                    "role": "system",
                    "content": "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                    "Your task is to compare the predicted answer with these correct answers and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                    "------"
                    "##INSTRUCTIONS: "
                    "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                    "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                    "- Consider synonyms or paraphrases as valid matches.\n"
                    "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity.",
                },
                {
                    "role": "user",
                    "content": "Please evaluate the following video-based question-answer pair:\n\n"
                    f"Question: {question}\n"
                    f"Correct Answers: {answer}\n"
                    f"Predicted Answer: {pred}\n\n"
                    "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                    "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                    "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                    "For example, your response should look like this: {''score': 4.8}.",
                },
            ]
            # `client` is the module-level Azure client created in the
            # script's __main__ section.
            completion = interaction(client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # A bare number or list would parse fine but break the score
            # aggregation later, so reject it here and leave the sample
            # unprocessed.
            if not isinstance(response_dict, dict):
                raise ValueError(f"expected a dict response, got: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]
            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")

        # Pause between requests.
        time.sleep(1)
89
+
90
+
91
def longest_repeating_substring(s):
    """Return the longest substring that occurs twice in ``s`` without overlapping.

    Args:
        s: The string to search.

    Returns:
        The first longest substring found that appears at two
        non-overlapping positions in ``s``, or ``""`` when no character
        repeats (including when ``s`` is empty).
    """
    n = len(s)
    # dp[i][j] (i < j) is the length of the longest common suffix of s[:i]
    # and s[:j] whose two occurrences do not overlap.
    dp = [[0] * (n + 1) for _ in range(n + 1)]
    res_length = 0
    index = 0  # 1-based end position of the first occurrence of the best match

    for i in range(1, n + 1):
        for j in range(i + 1, n + 1):
            # Extend a match only when the current characters are equal AND
            # the extended match would still fit in the gap (j - i), i.e. the
            # two occurrences stay disjoint. The previous condition was
            # `(prev > 0 and prev < gap) or chars_equal`, which extended a
            # match across differing characters.
            if s[i - 1] == s[j - 1] and dp[i - 1][j - 1] < (j - i):
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > res_length:
                    res_length = dp[i][j]
                    index = i

    return s[index - res_length:index]
113
+
114
+
115
def main(args):
    """Score predictions with the GPT judge and write the combined results.

    Args:
        args: Parsed CLI namespace. Reads ``pred_path``, ``num_chunks``,
            ``output_dir``, ``output_json`` and ``num_tasks``.

    Side effects:
        Creates ``args.output_dir`` if needed, calls ``annotate`` to write one
        ``<video>_<n>.json`` per sample into it, then writes every JSON file
        found in that directory plus an ``average_score`` entry to
        ``args.output_json`` and prints the average.
    """
    # Predictions are stored as JSON lines, optionally split into
    # "<num_chunks>_<idx>.json" shards inside the pred_path directory.
    if args.num_chunks > 1:
        pred_files = [
            os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
            for _idx in range(args.num_chunks)
        ]
    else:
        pred_files = [args.pred_path]

    pred_contents = []
    for pred_file in pred_files:
        with open(pred_file) as f:
            pred_contents += [json.loads(line) for line in f]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Give every sample a unique key: "<file stem>_<occurrence index>".
    for sample in pred_contents:
        video_id = sample["video_name"]
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # NOTE: the sample dict is renamed in place, not copied.
        new_sample = sample
        new_sample["video_name"] = f"{video_id.split('/')[-1].split('.')[0]}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x["video_name"] for x in new_pred_contents]
    caption_files = [f"{sample_id}.json" for sample_id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        prediction_set[sample["video_name"]] = {
            "q": sample["question"],
            "a": sample["answer"],
            "pred": sample["pred"],
        }

    # A non-positive split count would make the division below fail on every
    # pass and spin forever, so clamp it to at least one.
    num_tasks = max(1, args.num_tasks)

    # Loop until every caption has an output file. Samples whose annotation
    # keeps failing are retried indefinitely.
    while True:
        try:
            # Files that have already been processed.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i : i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
            print("Generate", len(all_parts), "subprocess.")

            # Only the first part is annotated per pass (the worker pool is
            # disabled); the remaining parts are picked up by later passes.
            annotate(*task_args[0])

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                try:
                    content = json.load(json_file)
                    combined_contents[file_name[:-5]] = content
                except Exception as e:
                    print(f"Error: {e}")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        try:
            # The judge's dict is expected to hold one entry; its first value
            # is taken as the score whatever the key is called.
            for score_key in result[0].keys():
                score_sum += int(result[0][score_key])
                break
        except Exception as e:
            # Report and continue; unparsable entries still count toward the
            # denominator and contribute 0 to the sum.
            print(f"Error processing file '{key}': {e}")

    # Guard against an empty result set instead of dividing by zero.
    average_score = score_sum / count if count else 0.0
    combined_contents["average_score"] = average_score
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file, indent=4)
    print("Average score for detailedness:", average_score)
236
+
237
+
238
if __name__ == "__main__":
    # Command-line entry point: collect paths and Azure credentials, publish
    # the credentials through the environment, build the client, run main().
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")

    # Input and output locations.
    parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")

    # Work splitting.
    parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
    parser.add_argument("--num_chunks", default=1, type=int, help="Result splits")

    # Azure OpenAI credentials.
    parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
    parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
    parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")

    args = parser.parse_args()

    # init() and interaction() read their settings from these variables.
    os.environ.update(
        {
            "AZURE_OPENAI_KEY": args.api_key,
            "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
            "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
        }
    )

    # Module-level on purpose: annotate() looks up `client` as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_mcqa_mvbench.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from tabulate import tabulate
4
+
5
+
6
# Task table keyed by task display name. Each value is a 4-tuple of
# (annotation file, data sub-directory, "video" or "frame", flag); per the
# inline notes the flag is True for tasks that carry start & end bounds.
# In this script only the keys are used, to group and order accuracies.
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    # NOTE(review): the leading "p" in this directory name looks like a typo - confirm.
    "Fine-grained Action": ("fine_grained_action.json", "pMoments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
28
+
29
+
30
def main():
    """Compute and print overall and per-task accuracy for a predictions file.

    The file named by ``--pred_path`` holds one record per line with keys
    ``'pred'``, ``'gt'`` and ``'task_type'``. Prints the overall accuracy and
    a table of per-task accuracies, four tasks per row.
    """
    import ast  # local import: this module does not import ast at file level

    args = parse_args()

    res = []
    with open(args.pred_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                res.append(json.loads(line))
            except json.JSONDecodeError:
                # Fall back to Python-literal records. literal_eval only
                # builds literals, unlike the eval() call it replaces, which
                # would execute arbitrary code found in the file.
                res.append(ast.literal_eval(line))

    task_acc = {x: [] for x in tasks}
    acc = []
    for x in res:
        value = 0 if x['pred'] != x['gt'] else 1
        acc.append(value)
        task_acc[x['task_type']].append(value)

    # Tasks (or a whole file) without samples report 0.0 instead of raising
    # ZeroDivisionError.
    acc = sum(acc) * 100 / len(acc) if acc else 0.0
    task_acc = {
        x: (sum(values) * 100 / len(values) if values else 0.0)
        for x, values in task_acc.items()
    }
    print(f"{args.pred_path}:", acc)
    task_names = list(tasks.keys())

    table_data = []
    # Step through the names four at a time; stepping by index (not
    # len // 4) keeps a trailing partial row instead of dropping it.
    for start in range(0, len(task_names), 4):
        row_task_names = task_names[start: start + 4]
        row_task_acc = [task_acc[x] for x in row_task_names]
        table_data.append(row_task_names)
        table_data.append(row_task_acc)
    print(tabulate(table_data, floatfmt=".1f"), '\n')
54
+
55
+
56
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--pred_path``, the predictions file, which defaults
    to an empty string.
    """
    cli = argparse.ArgumentParser(description="Evaluate video captioning.")
    cli.add_argument("--pred_path", default=r'', help="The path to file containing prediction.")
    return cli.parse_args()
61
+
62
+
63
# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
videollama2/eval/eval_video_mcqa_videomme.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import argparse
5
+ from typing import List, Dict, Optional, Union
6
+
7
# Video domains. eval_your_results() creates one counter per entry and
# indexes the counters by each result item's "domain" value.
CATEGORIES = [
    "Knowledge",
    "Film & Television",
    "Sports Competition",
    "Artistic Performance",
    "Life Record",
    "Multilingual"
]

# Video sub-categories. Counters are indexed by each result item's
# "sub_category" value.
SUB_CATEGORIES = [
    "Humanity & History",
    "Literature & Art",
    "Biology & Medicine",
    "Finance & Commerce",
    "Astronomy",
    "Geography",
    "Law",
    "Life Tip",
    "Technology",
    "Animation",
    "Movie & TV Show",
    "Documentary",
    "News Report",
    "Esports",
    "Basketball",
    "Football",
    "Athletics",
    "Other Sports",
    "Stage Play",
    "Magic Show",
    "Variety Show",
    "Acrobatics",
    "Handicraft",
    "Food",
    "Fashion",
    "Daily Life",
    "Travel",
    "Pet & Animal",
    "Exercise",
    "Multilingual"
]

# Question task types. Counters are indexed by each question's "task_type"
# value; the overall score is summed over exactly these entries.
TASK_CATEGORIES = [
    "Temporal Perception",
    "Spatial Perception",
    "Attribute Perception",
    "Action Recognition",
    "Object Recognition",
    "OCR Problems",
    "Counting Problem",
    "Temporal Reasoning",
    "Spatial Reasoning",
    "Action Reasoning",
    "Object Reasoning",
    "Information Synopsis",
]
63
+
64
+
65
def extract_characters_regex(s):
    """Extract the chosen option letter (A-D) from a free-form model response.

    Known lead-in phrases such as "The answer is" are removed first so that
    capital letters inside them (for example the "B" of "Best answer:") are
    not mistaken for the chosen option.

    Args:
        s: The raw response text.

    Returns:
        The first of ``"A"``, ``"B"``, ``"C"`` or ``"D"`` found after the
        lead-in phrases are removed, or ``""`` when none is present.
    """
    s = s.strip()
    # Every prefix needs its own trailing comma: without it Python joins
    # adjacent string literals into one phrase that never matches.
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    # A single search covers both the long-response and short-response cases:
    # no letter means no answer, whatever the length.
    matches = re.search(r'[ABCD]', s)
    if matches is None:
        return ""
    return matches[0]
86
+
87
+
88
def _percent(correct, answered):
    """Return ``100 * correct / answered``, or 0 when nothing was answered."""
    return 100 * correct / answered if answered > 0 else 0


def _print_heading(title, rule="-------------------------------------"):
    """Print ``title`` framed above and below by a ``rule`` line."""
    print(rule)
    print(title)
    print(rule)


def _new_counters(names):
    """Return a fresh ``{name: {"correct": 0, "answered": 0}}`` table."""
    return {name: {"correct": 0, "answered": 0} for name in names}


def eval_your_results(
        your_results_path: str,
        video_types: Optional[Union[List[str], str]] = None,
        skip_missing: Optional[bool] = True,
        return_categories_accuracy: Optional[bool] = True,
        return_sub_categories_accuracy: Optional[bool] = False,
        return_task_types_accuracy: Optional[bool] = False,
        gt_answer_key: Optional[str] = "answer",
        your_answer_key: Optional[str] = "response"

    ):
    """
    Evaluate your results against the ground truth and print accuracy tables.

    Only questions from which an option letter can be extracted count as
    answered; accuracy is correct / answered.

    Args:
    - your_results_path (str): Path to your results file (a JSON list of video items).
    - video_types (Optional[List[str], str]): Duration types to evaluate, as a list or a
      comma-separated string. If None, every duration present in the results file is
      evaluated, in order of first appearance.
    - skip_missing (Optional[bool]): If True, items flagged as missing are skipped. If False,
      each duration type must contain exactly 300 items, otherwise an AssertionError is raised.
    - return_categories_accuracy (Optional[bool]): If True, the accuracy for each video category is printed.
    - return_sub_categories_accuracy (Optional[bool]): If True, the accuracy for each video sub category is printed.
    - return_task_types_accuracy (Optional[bool]): If True, the accuracy for each task category is printed.
    - gt_answer_key (Optional[str]): Key to access the ground truth answer in the results file.
    - your_answer_key (Optional[str]): Key to access your answer in the results file.

    Returns:
        None. All results are printed.
    """

    # Load your results
    with open(your_results_path, 'r') as f:
        your_results = json.load(f)

    if isinstance(video_types, str):
        video_types = video_types.split(",")
    elif video_types is None:
        # The declared default used to fail with "NoneType is not iterable";
        # fall back to the durations actually present in the file.
        video_types = list(dict.fromkeys(item["duration"] for item in your_results))

    q_type_dict = {}
    v_type_dict = {}
    v_sub_type_dict = {}

    for video_type in video_types:

        # Filter your results based on video types
        your_results_video_type = [item for item in your_results if item["duration"] == video_type]

        # One counter table per grouping for this duration type.
        q_type_dict[video_type] = _new_counters(TASK_CATEGORIES)
        v_type_dict[video_type] = _new_counters(CATEGORIES)
        v_sub_type_dict[video_type] = _new_counters(SUB_CATEGORIES)

        if not skip_missing:
            # Check if the number of files in your results and ground truth are the same
            assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."

        for item in your_results_video_type:

            # Items without a "missing" flag are treated as present.
            if skip_missing and item.get("missing", False):
                continue

            # Get the video category, sub category and question category
            video_category = item["domain"]
            video_sub_category = item["sub_category"]

            for question in item["questions"]:
                q_type = question["task_type"]

                # Get the ground truth and your response
                gt_answer = question[gt_answer_key]
                response = question[your_answer_key]

                # Extract the answer from the response
                extraction = extract_characters_regex(response)
                if extraction == "":
                    continue

                is_correct = extraction == gt_answer
                for counter in (
                    q_type_dict[video_type][q_type],
                    v_type_dict[video_type][video_category],
                    v_sub_type_dict[video_type][video_sub_category],
                ):
                    counter["answered"] += 1
                    counter["correct"] += is_correct

    # Print the results for each video type
    for video_type in video_types:

        _print_heading(f"Evaluation on video Type: {video_type}", rule="=====================================")

        sections = (
            (return_categories_accuracy, "Video Domains", v_type_dict[video_type]),
            (return_sub_categories_accuracy, "Video Sub Categories", v_sub_type_dict[video_type]),
            (return_task_types_accuracy, "Task Categories", q_type_dict[video_type]),
        )
        for enabled, title, table in sections:
            if not enabled:
                continue
            _print_heading(title)
            for name, counter in table.items():
                print(f"{name}: {_percent(counter['correct'], counter['answered']): .1f}%")

        _print_heading("Overall Performance")
        total_correct = sum(q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES)
        total_answered = sum(q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES)
        print(f"Overall: {_percent(total_correct, total_answered): .1f}%")

        print("\n")

    # Print the results for the entire dataset
    _print_heading("Evaluation on the entire dataset", rule="=====================================")

    dataset_sections = (
        (return_categories_accuracy, "Video Categories", CATEGORIES, v_type_dict),
        (return_sub_categories_accuracy, "Video Sub Categories", SUB_CATEGORIES, v_sub_type_dict),
        (return_task_types_accuracy, "Task Categories", TASK_CATEGORIES, q_type_dict),
    )
    for enabled, title, names, table in dataset_sections:
        if not enabled:
            continue
        _print_heading(title)
        for name in names:
            total_correct = sum(table[video_type][name]["correct"] for video_type in video_types)
            total_answered = sum(table[video_type][name]["answered"] for video_type in video_types)
            print(f"{name}: {_percent(total_correct, total_answered): .1f}%")

    _print_heading("Overall Performance")
    total_correct = sum(
        q_type_dict[video_type][q_type]["correct"]
        for video_type in video_types
        for q_type in TASK_CATEGORIES
    )
    total_answered = sum(
        q_type_dict[video_type][q_type]["answered"]
        for video_type in video_types
        for q_type in TASK_CATEGORIES
    )
    print(f"Overall: {_percent(total_correct, total_answered): .1f}%")
256
+
257
+
258
+
259
if __name__ == "__main__":
    # Command-line entry point: two required string options plus four
    # boolean switches that are forwarded to eval_your_results().
    parser = argparse.ArgumentParser()
    for option in ("--results_file", "--video_duration_type"):
        parser.add_argument(option, type=str, required=True)
    for switch in (
        "--return_categories_accuracy",
        "--return_sub_categories_accuracy",
        "--return_task_types_accuracy",
        "--skip_missing",
    ):
        parser.add_argument(switch, action="store_true")

    args = parser.parse_args()

    report_options = {
        "video_types": args.video_duration_type,
        "skip_missing": args.skip_missing,
        "return_categories_accuracy": args.return_categories_accuracy,
        "return_sub_categories_accuracy": args.return_sub_categories_accuracy,
        "return_task_types_accuracy": args.return_task_types_accuracy,
    }
    eval_your_results(args.results_file, **report_options)
videollama2/eval/eval_video_oqa_activitynet.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import time
5
+ import argparse
6
+ import traceback
7
+ from tqdm import tqdm
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ from openai import AzureOpenAI
11
+
12
+
13
def init():
    """Create the Azure OpenAI client from the AZURE_OPENAI_* environment variables.

    Returns:
        An ``AzureOpenAI`` client pinned to API version ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
21
+
22
+
23
def interaction(client, message_text):
    """Send a chat conversation to the configured deployment.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: List of chat messages passed through as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request = {
        # The deployment name is read from the environment on every call.
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
36
+
37
+
38
def prompt_gpt(question, answer, pred, key, qa_set, output_dir):
    """Ask the judge model to grade one prediction and store its verdict on disk.

    The reply is parsed with ``ast.literal_eval`` and written, together with
    ``qa_set``, to ``{output_dir}/{key}.json``. Uses the module-level ``client``.

    Args:
        question: Question text inserted into the prompt.
        answer: Reference answer inserted into the prompt.
        pred: Predicted answer inserted into the prompt.
        key: Basename (without extension) of the output JSON file.
        qa_set: Object stored next to the parsed reply.
        output_dir: Directory that receives the JSON file.
    """
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
        "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
        "- Consider synonyms or paraphrases as valid matches.\n"
        "- Evaluate the correctness of the prediction compared to the answer."
    )
    user_prompt = (
        "Please evaluate the following video-based question-answer pair:\n\n"
        f"Question: {question}\n"
        f"Correct Answer: {answer}\n"
        f"Predicted Answer: {pred}\n\n"
        "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
        "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
        "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    reply = interaction(client, conversation).choices[0].message.content
    # The judge is asked for a Python dict literal, so parse it as one.
    verdict = ast.literal_eval(reply)

    # Persist the verdict alongside the question-answer set.
    with open(f"{output_dir}/{key}.json", "w") as out_file:
        json.dump([verdict, qa_set], out_file)
72
+
73
+
74
def annotate(task_arg):
    """Grade one partition of predictions, one judge request per sample.

    Each sample is sent to ``prompt_gpt``, which writes ``{output_dir}/{key}.json``.
    If the request fails it is retried once with the prediction cut to its first
    50 characters. A sample that fails twice is logged and skipped, so the rest
    of the partition is still processed; the missing output file lets the caller
    pick the sample up again on its next pass.

    Args:
        task_arg: Tuple ``(prediction_set, caption_files, output_dir, args)``.
            ``caption_files`` holds ``<key>.json`` names and ``prediction_set``
            maps each key to a dict with ``'q'``, ``'a'`` and ``'p'``.

    Returns:
        None. Results are only written to disk.
    """
    prediction_set, caption_files, output_dir, _ = task_arg

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            prompt_gpt(question, answer, pred, key, qa_set, output_dir)
        except Exception:
            traceback.print_exc()
            try:
                # Second attempt with a shortened prediction.
                prompt_gpt(question, answer, pred[:50], key, qa_set, output_dir)
            except Exception:
                # Do not let one bad sample abort the remaining files of this partition.
                traceback.print_exc()

    # NOTE(review): indentation was lost in the reviewed source; the pause is assumed
    # to run once per partition, after the loop — confirm.
    time.sleep(1)
94
+
95
+
96
def main(args):
    """Grade all predictions with the judge model and print accuracy and mean score.

    Reads one record per line from ``args.pred_path`` (keys ``id``, ``question``,
    ``answer``, ``pred``), grades every record that has no ``<id>.json`` in
    ``args.output_dir`` yet, repeats until none are missing, then loads all
    result files and prints yes/no counts, accuracy and the average score.

    Args:
        args: Parsed CLI namespace; ``pred_path``, ``output_dir`` and ``num_tasks``
            are read here.
    """
    # NOTE(review): args.output_json is never read in this function — confirm whether
    # the merged results are supposed to be saved.
    # NOTE(security): every line is passed to eval(); only use prediction files you trust.
    with open(args.pred_path) as pred_file:
        new_pred_contents = [eval(line.strip()) for line in pred_file]

    # File names expected in the output directory, one per sample id.
    caption_files = [f"{sample['id']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # annotate() looks samples up by the file name stem, which is always a string,
    # so key the table by str(id) to keep non-string ids resolvable.
    prediction_set = {
        str(sample['id']): {"q": sample['question'], "a": sample['answer'], "p": sample['pred']}
        for sample in new_pred_contents
    }

    # A non-positive value would yield a zero slice step below and loop forever on the error.
    max_workers = max(1, args.num_tasks)
    num_tasks = max_workers

    # Loop until every expected result file exists.
    while True:
        try:
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            completed = set(completed_files)
            incomplete_files = [f for f in caption_files if f not in completed]
            print(f"incomplete_files: {len(incomplete_files)}")

            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into roughly equal parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                list(tqdm(executor.map(annotate, task_args), total=len(task_args)))

        except Exception as e:
            print(f"Error: {e}")

    def combine_json(file_name):
        """Load one result file and return ``(key, content)``."""
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "r") as json_file:
            content = json.load(json_file)
        return (file_name[:-5], content)

    # Only result files are JSON; ignore anything else that lives in the directory.
    files = [name for name in os.listdir(output_dir) if name.endswith(".json")]
    with ThreadPoolExecutor(max_workers=64) as executor:
        combined_contents = list(tqdm(executor.map(combine_json, files), total=len(files)))

    # Calculate average score and accuracy.
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in tqdm(combined_contents):
        # Malformed results still count towards the average's denominator.
        count += 1
        try:
            score_sum += int(result[0]['score'])

            pred = result[0]['pred']
            if "yes" in pred.lower():
                yes_count += 1
            elif "no" in pred.lower():
                no_count += 1
        except Exception:
            print(result)

    # Guard against empty result sets instead of raising ZeroDivisionError.
    average_score = score_sum / count if count else 0.0
    judged = yes_count + no_count
    accuracy = yes_count / judged if judged else 0.0
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    # Script entry point: parse options, export the Azure settings, build the client, run.
    cli = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        cli.add_argument(flag, required=True, **options)
    args = cli.parse_args()

    # init() and interaction() read these settings from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create the Azure OpenAI client from the AZURE_OPENAI_* environment variables.

    Returns:
        An ``AzureOpenAI`` client pinned to API version ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """Send a chat conversation to the configured deployment.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: List of chat messages passed through as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request = {
        # The deployment name is read from the environment on every call.
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Grade the factual correctness of each prediction with the judge model.

    For every ``<key>.json`` name in ``caption_files`` the matching entry of
    ``prediction_set`` is sent to the judge and ``[response_dict, qa_set]`` is
    written to ``{output_dir}/{key}.json``. A failing sample is reported on
    stdout and left without an output file.

    Args:
        prediction_set: Maps each key to a dict with ``'q'``, ``'a'`` and ``'p'``.
        caption_files: Output file names to produce in this call.
        output_dir: Directory that receives the per-sample JSON files.
        args: Unused here; kept for the caller's argument tuple.

    Returns:
        None. Results are only written to disk.
    """
    # `client` is only assigned under the ``__main__`` guard. Pool workers created with
    # the "spawn" start method re-import the module without running that guard, so fall
    # back to a locally built client (the AZURE_OPENAI_* variables are inherited).
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # NOTE(review): the example dict in the prompt ({''score': 4.8}) is malformed
            # and non-integer; it is left unchanged so scores stay comparable — confirm.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                        "- The predicted answer must be factually accurate and align with the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Evaluate the factual accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]
            completion = interaction(api_client, message)
            # Convert the reply (a Python dict literal) to a dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the verdict together with the question-answer set.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """Run the correctness evaluation end to end and print the average score.

    Reads one record per line from ``args.pred_path`` (keys ``video_name``, ``Q``,
    ``A``, ``P``), makes every ``video_name`` unique by appending a per-video
    counter, has pool workers grade each record into
    ``args.output_dir/<name>.json`` until none are missing, merges all result
    files into ``args.output_json`` and prints the mean of the int-cast scores.

    Args:
        args: Parsed CLI namespace; ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks`` are read here.
    """
    # NOTE(security): every line is passed to eval(); only use prediction files you trust.
    with open(args.pred_path, 'r') as pred_file:
        pred_contents = [eval(line) for line in pred_file]

    # Number of earlier occurrences seen for each video name.
    video_id_counts = {}
    new_pred_contents = []

    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # The record is renamed in place (no copy is made).
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # File names expected in the output directory, one per record.
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Question-answer sets keyed by the unique record name.
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive value would yield a zero slice step below and loop forever on the error.
    num_tasks = max(1, args.num_tasks)

    # Loop until every expected result file exists.
    while True:
        try:
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            completed = set(completed_files)
            incomplete_files = [f for f in caption_files if f not in completed]
            print(f"incomplete_files: {len(incomplete_files)}")

            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into roughly equal parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Grade the parts in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            traceback.print_exc()

    # Merge all per-sample result files into one mapping.
    combined_contents = {}
    json_path = args.output_json

    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Average the scores; int() truncates any fractional score.
    score_sum = 0
    count = 0
    for result in combined_contents.values():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard against an empty result set instead of raising ZeroDivisionError.
    average_score = score_sum / count if count else 0.0

    print("Average score for correctness:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    # Script entry point: parse options, export the Azure settings, build the client, run.
    cli = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        cli.add_argument(flag, required=True, **options)
    args = cli.parse_args()

    # init() and interaction() read these settings from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ from tqdm import tqdm
6
+ from multiprocessing.pool import Pool
7
+
8
+ from openai import AzureOpenAI
9
+
10
+
11
def init():
    """Create the Azure OpenAI client from the AZURE_OPENAI_* environment variables.

    Returns:
        An ``AzureOpenAI`` client pinned to API version ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
19
+
20
+
21
def interaction(client, message_text):
    """Send a chat conversation to the configured deployment.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: List of chat messages passed through as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request = {
        # The deployment name is read from the environment on every call.
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
34
+
35
+
36
def annotate(prediction_set, caption_files, output_dir, args):
    """Grade the detail orientation of each prediction with the judge model.

    For every ``<key>.json`` name in ``caption_files`` the matching entry of
    ``prediction_set`` is sent to the judge and ``[response_dict, qa_set]`` is
    written to ``{output_dir}/{key}.json``. A failing sample is reported on
    stdout and left without an output file.

    Args:
        prediction_set: Maps each key to a dict with ``'q'``, ``'a'`` and ``'p'``.
        caption_files: Output file names to produce in this call.
        output_dir: Directory that receives the per-sample JSON files.
        args: Unused here; kept for the caller's argument tuple.

    Returns:
        None. Results are only written to disk.
    """
    # `client` is only assigned under the ``__main__`` guard. Pool workers created with
    # the "spawn" start method re-import the module without running that guard, so fall
    # back to a locally built client (the AZURE_OPENAI_* variables are inherited).
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Build the detail-orientation grading prompt.
            # NOTE(review): the example dict in the prompt ({''score': 4.8}) is malformed
            # and non-integer; it is left unchanged so scores stay comparable — confirm.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                        "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert the reply (a Python dict literal) to a dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the verdict together with the question-answer set.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """Run the detail-orientation evaluation end to end and print the average score.

    Reads one record per line from ``args.pred_path`` (keys ``video_name``, ``Q``,
    ``A``, ``P``), makes every ``video_name`` unique by appending a per-video
    counter, has pool workers grade each record into
    ``args.output_dir/<name>.json`` until none are missing, merges all result
    files into ``args.output_json`` and prints the mean of the int-cast scores.

    Args:
        args: Parsed CLI namespace; ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks`` are read here.
    """
    # NOTE(security): every line is passed to eval(); only use prediction files you trust.
    with open(args.pred_path, 'r') as pred_file:
        pred_contents = [eval(line) for line in pred_file]

    # Number of earlier occurrences seen for each video name.
    video_id_counts = {}
    new_pred_contents = []

    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # The record is renamed in place (no copy is made).
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # File names expected in the output directory, one per record.
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Question-answer sets keyed by the unique record name.
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive value would yield a zero slice step below and loop forever on the error.
    num_tasks = max(1, args.num_tasks)

    # Loop until every expected result file exists.
    while True:
        try:
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            completed = set(completed_files)
            incomplete_files = [f for f in caption_files if f not in completed]
            print(f"incomplete_files: {len(incomplete_files)}")

            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into roughly equal parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Grade the parts in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Merge all per-sample result files into one mapping.
    combined_contents = {}
    json_path = args.output_json

    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Average the scores; int() truncates any fractional score.
    score_sum = 0
    count = 0
    for result in combined_contents.values():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard against an empty result set instead of raising ZeroDivisionError.
    average_score = score_sum / count if count else 0.0

    print("Average score for detailed orientation:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    # Script entry point: parse options, export the Azure settings, build the client, run.
    cli = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        cli.add_argument(flag, required=True, **options)
    args = cli.parse_args()

    # init() and interaction() read these settings from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_3_context.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create the Azure OpenAI client from the AZURE_OPENAI_* environment variables.

    Returns:
        An ``AzureOpenAI`` client pinned to API version ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """Send a chat conversation to the configured deployment.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: List of chat messages passed through as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request = {
        # The deployment name is read from the environment on every call.
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Grade the contextual understanding of each prediction with the judge model.

    For every ``<key>.json`` name in ``caption_files`` the matching entry of
    ``prediction_set`` is sent to the judge and ``[response_dict, qa_set]`` is
    written to ``{output_dir}/{key}.json``. A failing sample is reported on
    stdout and left without an output file.

    Args:
        prediction_set: Maps each key to a dict with ``'q'``, ``'a'`` and ``'p'``.
        caption_files: Output file names to produce in this call.
        output_dir: Directory that receives the per-sample JSON files.
        args: Unused here; kept for the caller's argument tuple.

    Returns:
        None. Results are only written to disk.
    """
    # `client` is only assigned under the ``__main__`` guard. Pool workers created with
    # the "spawn" start method re-import the module without running that guard, so fall
    # back to a locally built client (the AZURE_OPENAI_* variables are inherited).
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Build the contextual-understanding grading prompt.
            # NOTE(review): the example dict in the prompt ({''score': 4.8}) is malformed
            # and non-integer; it is left unchanged so scores stay comparable — confirm.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
                        "- The predicted answer must capture the main themes and sentiments of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert the reply (a Python dict literal) to a dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the verdict together with the question-answer set.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
90
+
91
+
92
def main(args):
    """Run the contextual-understanding evaluation end to end and print the average score.

    Reads one record per line from ``args.pred_path`` (keys ``video_name``, ``Q``,
    ``A``, ``P``), makes every ``video_name`` unique by appending a per-video
    counter, has pool workers grade each record into
    ``args.output_dir/<name>.json`` until none are missing, merges all result
    files into ``args.output_json`` and prints the mean of the int-cast scores.

    Args:
        args: Parsed CLI namespace; ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks`` are read here.
    """
    # NOTE(security): every line is passed to eval(); only use prediction files you trust.
    with open(args.pred_path, 'r') as pred_file:
        pred_contents = [eval(line) for line in pred_file]

    # Number of earlier occurrences seen for each video name.
    video_id_counts = {}
    new_pred_contents = []

    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # The record is renamed in place (no copy is made).
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # File names expected in the output directory, one per record.
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Question-answer sets keyed by the unique record name.
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive value would yield a zero slice step below and loop forever on the error.
    num_tasks = max(1, args.num_tasks)

    # Loop until every expected result file exists.
    while True:
        try:
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            completed = set(completed_files)
            incomplete_files = [f for f in caption_files if f not in completed]
            print(f"incomplete_files: {len(incomplete_files)}")

            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into roughly equal parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Grade the parts in parallel worker processes.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Merge all per-sample result files into one mapping.
    combined_contents = {}
    json_path = args.output_json

    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Average the scores; int() truncates any fractional score.
    score_sum = 0
    count = 0
    for result in combined_contents.values():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard against an empty result set instead of raising ZeroDivisionError.
    average_score = score_sum / count if count else 0.0

    print("Average score for contextual understanding:", average_score)
192
+
193
+
194
if __name__ == "__main__":
    # Script entry point: parse options, export the Azure settings, build the client, run.
    cli = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, options in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        cli.add_argument(flag, required=True, **options)
    args = cli.parse_args()

    # init() and interaction() read these settings from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Build the Azure OpenAI client used by this script.

    The endpoint and key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables; the API version is pinned.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    return AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_KEY"),
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is looked up on every call from the environment.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Ask the GPT judge for a temporal-understanding score for each file.

    For every entry of ``caption_files`` the matching QA set is looked up in
    ``prediction_set``, sent to the chat model, and the parsed response is
    written to ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.
    Failures are printed and the file is left unwritten, so the caller can
    detect it as incomplete and retry.

    Args:
        prediction_set: Mapping from key (file name without ``.json``) to a
            dict with keys ``q``, ``a`` and ``p``.
        caption_files: File names (``<key>.json``) to process.
        output_dir: Directory receiving one JSON file per processed key.
        args: Unused; kept for interface compatibility.
    """
    if not caption_files:
        return

    # The module-level `client` is only created under the `__main__` guard,
    # so it does not exist in worker processes started with the "spawn"
    # method. Fall back to building one here; the credentials are environment
    # variables and therefore inherited by child processes.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
                        "- Evaluate the temporal accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING. "
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        # The example used to be the malformed, non-integer
                        # literal {''score': 4.8}, contradicting the request above.
                        "For example, your response should look like this: {'score': 4}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Refuse to persist a response that the averaging step cannot read;
            # leaving the file unwritten makes the caller retry this key.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"response has no 'score' entry: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
84
+
85
+
86
def main(args):
    """Score all predictions with the GPT judge and print the average score.

    Reads ``args.pred_path`` (one record per line with keys ``video_name``,
    ``Q``, ``A`` and ``P``), makes every ``video_name`` unique by appending a
    per-video counter, and repeatedly dispatches the not-yet-annotated keys to
    ``annotate`` in a process pool until ``args.output_dir`` holds one JSON
    file per key. All ``.json`` files in that directory are then merged into
    ``args.output_json`` and the mean of their integer scores is printed.

    Args:
        args: Namespace providing ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks``.
    """
    # Parse line by line as JSON instead of eval(): it is safe on untrusted
    # files and understands true/false/null. Lines that are Python dict
    # literals rather than JSON are still accepted via ast.literal_eval.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for raw_line in pred_file:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                pred_contents.append(json.loads(raw_line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(raw_line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # Create a new sample (a copy) with the de-duplicated key
        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of ids and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the division below fail on every
    # pass and spin forever, so clamp it to at least one.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    # NOTE: an error that never goes away keeps this loop running.
    while True:
        try:
            # Files that have already been written (set for O(1) membership).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score (each score is truncated to int, as before).
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluation results found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score temporal understanding:", average_score)
186
+
187
+
188
if __name__ == "__main__":
    # Command-line entry point; every option below is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, extra in (
        ("--pred-path", dict(help="The path to file containing prediction.")),
        ("--output-dir", dict(help="The path to save annotation json files.")),
        ("--output-json", dict(help="The path to save annotation final combined json file.")),
        ("--num-tasks", dict(type=int, help="Number of splits.")),
        ("--api-key", dict(type=str, help="Azure Openai API key.")),
        ("--api-endpoint", dict(type=str, help="Azure Openai API endpoint.")),
        ("--api-deployname", dict(type=str, help="Azure Openai API deployname.")),
    ):
        parser.add_argument(flag, required=True, **extra)
    args = parser.parse_args()

    # init() and interaction() read these settings back from the environment.
    for env_name, env_value in (
        ("AZURE_OPENAI_KEY", args.api_key),
        ("AZURE_OPENAI_ENDPOINT", args.api_endpoint),
        ("AZURE_OPENAI_DEPLOYNAME", args.api_deployname),
    ):
        os.environ[env_name] = env_value

    # Module-level client that annotate() picks up as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create the Azure OpenAI client from environment configuration.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_KEY`` from the
    environment and pins the API version to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
20
+
21
+
22
def interaction(client, message_text):
    """Issue a single chat-completion call with the fixed sampling settings.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    deployment = os.getenv("AZURE_OPENAI_DEPLOYNAME")
    sampling = dict(
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(model=deployment, messages=message_text, **sampling)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Ask the GPT judge for a consistency score for each file.

    For every entry of ``caption_files`` the matching QA set is looked up in
    ``prediction_set``, sent to the chat model, and the parsed response is
    written to ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.
    Failures are printed and the file is left unwritten, so the caller can
    detect it as incomplete and retry.

    Args:
        prediction_set: Mapping from key (file name without ``.json``) to a
            dict with keys ``q1``, ``q2``, ``a``, ``p1`` and ``p2``.
        caption_files: File names (``<key>.json``) to process.
        output_dir: Directory receiving one JSON file per processed key.
        args: Unused; kept for interface compatibility.
    """
    if not caption_files:
        return

    # The module-level `client` is only created under the `__main__` guard,
    # so it does not exist in worker processes started with the "spawn"
    # method. Fall back to building one here; the credentials are environment
    # variables and therefore inherited by child processes.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question1 = qa_set['q1']
        question2 = qa_set['q2']
        answer = qa_set['a']
        pred1 = qa_set['p1']
        pred2 = qa_set['p2']
        try:
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
                        "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
                        "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
                        "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
                        "- Evaluate the consistency of the two predicted answers compared to the correct answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question 1: {question1}\n"
                        f"Question 2: {question2}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer to Question 1: {pred1}\n"
                        f"Predicted Answer to Question 2: {pred2}\n\n"
                        "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING. "
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        # The example used to be the malformed, non-integer
                        # literal {''score': 4.8}, contradicting the request above.
                        "For example, your response should look like this: {'score': 4}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Refuse to persist a response that the averaging step cannot read;
            # leaving the file unwritten makes the caller retry this key.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"response has no 'score' entry: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
94
+
95
+
96
def main(args):
    """Score all prediction pairs with the GPT judge and print the average.

    Reads ``args.pred_path`` (one record per line with keys ``video_name``,
    ``Q1``, ``Q2``, ``A``, ``P1`` and ``P2``), makes every ``video_name``
    unique by appending a per-video counter, and repeatedly dispatches the
    not-yet-annotated keys to ``annotate`` in a process pool until
    ``args.output_dir`` holds one JSON file per key. All ``.json`` files in
    that directory are then merged into ``args.output_json`` and the mean of
    their integer scores is printed.

    Args:
        args: Namespace providing ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks``.
    """
    # Parse line by line as JSON instead of eval(): it is safe on untrusted
    # files and understands true/false/null. Lines that are Python dict
    # literals rather than JSON are still accepted via ast.literal_eval.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for raw_line in pred_file:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                pred_contents.append(json.loads(raw_line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(raw_line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # Create a new sample (a copy) with the de-duplicated key
        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of ids and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {
            "q1": sample['Q1'],
            "q2": sample['Q2'],
            "a": sample['A'],
            "p1": sample['P1'],
            "p2": sample['P2'],
        }
        for sample in new_pred_contents
    }

    # A non-positive split count would make the division below fail on every
    # pass and spin forever, so clamp it to at least one.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    # NOTE: an error that never goes away keeps this loop running.
    while True:
        try:
            # Files that have already been written (set for O(1) membership).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score (each score is truncated to int, as before).
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluation results found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score for consistency:", average_score)
198
+
199
+
200
if __name__ == "__main__":
    # Command-line entry point; every option below is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    required_options = [
        ("--pred-path", None, "The path to file containing prediction."),
        ("--output-dir", None, "The path to save annotation json files."),
        ("--output-json", None, "The path to save annotation final combined json file."),
        ("--num-tasks", int, "Number of splits."),
        ("--api-key", str, "Azure Openai API key."),
        ("--api-endpoint", str, "Azure Openai API endpoint."),
        ("--api-deployname", str, "Azure Openai API deployname."),
    ]
    for flag, value_type, help_text in required_options:
        if value_type is None:
            parser.add_argument(flag, required=True, help=help_text)
        else:
            parser.add_argument(flag, required=True, type=value_type, help=help_text)
    args = parser.parse_args()

    # init() and interaction() read these settings back from the environment.
    os.environ["AZURE_OPENAI_KEY"] = args.api_key
    os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
    os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname

    # Module-level client that annotate() picks up as a global.
    client = init()

    main(args)
videollama2/eval/inference_video_cap_msvc.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import argparse
4
+ import json
5
+ import warnings
6
+ from tqdm import tqdm
7
+
8
+ from torch.utils.data import Dataset, DataLoader
9
+
10
+ import sys
11
+ sys.path.append('./')
12
+ from videollama2 import model_init, mm_infer
13
+ from videollama2.utils import disable_torch_init
14
+
15
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
16
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
17
+
18
+
19
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks may be returned (e.g. 5 items with ``n=4``
    give 3 chunks). An empty list yields an empty list of chunks.
    """
    if len(lst) == 0:
        # ceil(0 / n) == 0 would be an invalid range() step below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
def get_chunk(lst, n, k):
    """Return the ``k``-th of the chunks produced by ``split_list(lst, n)``.

    ``split_list`` can return fewer than ``n`` chunks; a valid index
    ``k < n`` that falls past the last produced chunk gets an empty list
    instead of an ``IndexError``, so trailing workers simply have no work.
    Indices ``k >= n`` still raise ``IndexError``.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
28
+
29
+
30
class MSVCDataset(Dataset):
    """Dataset pairing each question record with its processed video.

    Each record in ``questions`` must provide ``video_path``, ``question``
    and ``captions``; the video path is resolved relative to ``folder`` and
    handed to ``processor``.
    """

    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, folder, questions, processor):
        self.folder, self.questions, self.processor = folder, questions, processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        record = self.questions[idx]

        relative_path = record['video_path']
        prompt = record['question']
        references = record['captions']

        # The processor receives the path joined onto the dataset folder.
        frames = self.processor(os.path.join(self.folder, relative_path))

        return dict(
            video=frames,
            video_name=relative_path,
            question=prompt,
            answer=references,
        )
58
+
59
+
60
def collate_fn(batch):
    """Transpose a batch of sample dicts into four parallel lists.

    Returns:
        ``(videos, video_names, questions, answers)``, each a list with one
        entry per sample, in batch order.
    """
    fields = ('video', 'video_name', 'question', 'answer')
    vid, v_id, qus, ans = ([item[field] for item in batch] for field in fields)
    return vid, v_id, qus, ans
66
+
67
+
68
def run_inference(args):
    """Run video inference over one chunk of questions and write JSON lines.

    Loads the model from ``args.model_path``, selects chunk ``args.chunk_idx``
    of ``args.num_chunks`` from ``args.question_file``, and writes one JSON
    object per sample (``video_name``, ``question``, ``answer``, ``pred``) to
    ``args.output_file``.

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    # Validate up front with a real exception: `assert` is stripped under -O,
    # and failing here avoids loading the model for nothing.
    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")

    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as question_file:
        gt_questions = json.load(question_file)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)

    answer_file = args.output_file
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(answer_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dataset = MSVCDataset(args.video_folder, gt_questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    # The context manager closes the output file even if inference fails.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions, answers in tqdm(dataloader):
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = questions[0]
            answer = answers[0]

            output = mm_infer(
                video_tensor,
                question,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            sample_set = {'video_name': video_name, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
104
+
105
+
106
if __name__ == "__main__":
    # Command-line entry point for MSVC captioning inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path handed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This is a file path (run_inference opens it for writing), not a directory.
    parser.add_argument('--output-file', help='Path of the JSON-lines file to write the model results to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE: --device is parsed but not read by run_inference in this file.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_egoschema.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+
9
+ from tqdm import tqdm
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks may be returned. An empty list yields an empty
    list of chunks.
    """
    if len(lst) == 0:
        # ceil(0 / n) == 0 would be an invalid range() step below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return the ``k``-th of the chunks produced by ``split_list(lst, n)``.

    ``split_list`` can return fewer than ``n`` chunks; a valid index
    ``k < n`` that falls past the last produced chunk gets an empty list
    instead of an ``IndexError``. Indices ``k >= n`` still raise
    ``IndexError``.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
30
+
31
+
32
class EgoschemaDataset(Dataset):
    """Dataset yielding a processed video and a five-option MCQ prompt.

    Each record in ``data_list`` must provide ``q_uid``, ``question`` and
    ``option 0`` .. ``option 4``. The video is looked up in ``data_folder`` as
    ``<q_uid><ext>`` for each extension in ``video_formats``, in order.
    """

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_folder, data_list, processor):
        self.data_folder = data_folder
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        line = self.data_list[idx]
        q_uid = line['q_uid']

        # Take the first existing candidate; fail with a clear error instead
        # of an UnboundLocalError when no file matches.
        video_path = None
        for fmt in self.video_formats:
            temp_path = os.path.join(self.data_folder, f"{q_uid}{fmt}")
            if os.path.exists(temp_path):
                video_path = temp_path
                break
        if video_path is None:
            raise FileNotFoundError(
                f"No video found for q_uid '{q_uid}' in '{self.data_folder}' "
                f"(tried extensions: {', '.join(self.video_formats)})"
            )

        video_tensor = self.processor(video_path)

        question = line['question']
        a0 = line['option 0']
        a1 = line['option 1']
        a2 = line['option 2']
        a3 = line['option 3']
        a4 = line['option 4']

        instruct = f'Select the best answer to the following multiple-choice question based on the video.\n{question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option. The best answer is: '

        return {
            'q_uid': q_uid,
            'video': video_tensor,
            'instruct': instruct,
        }
72
+
73
+
74
def build_egoschema_eval(args, processor):
    """Build the DataLoader over this worker's chunk of EgoSchema questions.

    Args:
        args: Namespace providing ``question_file``, ``num_chunks``,
            ``chunk_idx``, ``video_folder``, ``batch_size`` and
            ``num_workers``.
        processor: Callable applied by the dataset to each video path.

    Returns:
        A non-shuffling ``DataLoader`` over an ``EgoschemaDataset``.
    """
    # Use a context manager so the question file handle is closed promptly.
    with open(args.question_file, "r") as question_file:
        questions = json.load(question_file)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(args.video_folder, questions, processor)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
81
+
82
+
83
def egoschema_dump(ans_file, line, outputs):
    """Parse option letters from model outputs and append them to ``ans_file``.

    For each output, the words ``answer``/``Answer`` are removed and the first
    capital letter ``A``-``E`` (optionally wrapped in parentheses/spaces) is
    taken as the prediction. One ``"<q_uid>, <index>\\n"`` line is written per
    output, where index 0-4 corresponds to A-E. If no letter is found, a
    diagnostic is printed to stderr and index 2 (option C) is written.

    Args:
        ans_file: Writable text file object.
        line: Batch dict whose ``q_uid`` and ``instruct`` entries are indexable
            per sample.
        outputs: Model output strings, aligned with the batch.
    """
    import sys  # local import keeps this block self-contained

    letters = ['A', 'B', 'C', 'D', 'E']
    fallback_idx = 2  # option C, used when the output cannot be parsed
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences ("\(" and "\ "), which newer Python versions warn about.
    answer_pattern = re.compile(r'[( ]*[A-E][) ]*')

    for idx, output in enumerate(outputs):
        q_uid = line['q_uid'][idx]
        instruct = line['instruct'][idx]

        output = output.replace('answer', '')
        output = output.replace('Answer', '')
        match = answer_pattern.search(output)
        if match is None:
            print(
                'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(q_uid, instruct, output),
                file=sys.stderr,
            )
            pred_idx = fallback_idx
        else:
            # Strip parentheses and spaces together so "( A )" resolves to A
            # instead of failing the lookup and falling back to C.
            pred_idx = letters.index(match.group().strip('() '))

        ans_file.write(f'{q_uid}, {pred_idx}\n')
103
+
104
+
105
def run_inference(args):
    """Run EgoSchema multiple-choice inference and write ``q_uid, index`` lines.

    Loads the model from ``args.model_path``, iterates over the loader built
    by ``build_egoschema_eval`` and writes one prediction line per sample to
    ``args.answer_file`` (``~`` is expanded). If inference fails for a sample,
    the traceback is printed and option ``C`` is recorded.

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    # Only the first element of each batch is inferred below, so larger
    # batches would silently drop samples.
    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")

    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    val_loader = build_egoschema_eval(args, processor['video'])

    # The context manager closes the output file even if the loop fails.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for line in tqdm(val_loader):
            video_tensor = line['video'][0]
            instruct = line['instruct'][0]

            try:
                pred = mm_infer(
                    video_tensor,
                    instruct,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )
            except Exception:
                # Best effort: record a default answer, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare `except:` would.
                traceback.print_exc()
                pred = 'C'

            egoschema_dump(ans_file, line, [pred])
137
+
138
+
139
if __name__ == "__main__":
    # Command-line entry point for EgoSchema multiple-choice inference.
    parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')

    parser.add_argument('--model-path', help='Model path handed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This file is written by run_inference; it is not a ground-truth input.
    parser.add_argument('--answer-file', help='Path of the output file to write "q_uid, predicted index" lines to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE: --device is parsed but not read by run_inference in this file.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_mvbench.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+
9
+ import torch
10
+ import numpy as np
11
+ from PIL import Image
12
+ from tqdm import tqdm
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2 import model_init, mm_infer
19
+ from videollama2.utils import disable_torch_init
20
+
21
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
22
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
23
+
24
+
25
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks may be returned. An empty list yields an empty
    list of chunks.
    """
    if len(lst) == 0:
        # ceil(0 / n) == 0 would be an invalid range() step below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
29
+
30
+
31
def get_chunk(lst, n, k):
    """Return the ``k``-th of the chunks produced by ``split_list(lst, n)``.

    ``split_list`` can return fewer than ``n`` chunks; a valid index
    ``k < n`` that falls past the last produced chunk gets an empty list
    instead of an ``IndexError``. Indices ``k >= n`` still raise
    ``IndexError``.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
34
+
35
+
36
class MVBenchDataset(Dataset):
    """Dataset turning MVBench entries into a video plus an MCQ prompt.

    Each entry of ``data_list`` provides ``bound``, ``prefix``, ``task_type``
    and a ``data`` dict with ``video``, ``question``, ``candidates`` and
    ``answer`` (plus ``start``/``end`` when ``bound`` is truthy).
    """

    def __init__(self, data_list, processor):
        self.data_list, self.processor = data_list, processor

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        entry = self.data_list[idx]

        # Clip boundaries are only read when the entry is flagged as bounded.
        if entry['bound']:
            start, end = entry['data']['start'], entry['data']['end']
        else:
            start, end = None, None

        video_path = os.path.join(entry['prefix'], entry['data']['video'])
        torch_imgs = self.processor(video_path, s=start, e=end)

        question = entry['data']['question']
        options = entry['data']['candidates']
        answer = entry['data']['answer']
        task_type = entry['task_type']

        letters = [f"{chr(ord('A') + position)}" for position, _ in enumerate(options)]
        options_string = ''.join(
            f"({letter}) {candidate}\n" for letter, candidate in zip(letters, options)
        )

        # Index of the last candidate equal to the answer; -1 when none matches.
        answer_idx = -1
        for position, candidate in enumerate(options):
            if candidate == answer:
                answer_idx = position

        instruct = f'Question: {question}\nOptions:\n{options_string}Answer with the option\'s letter from the given choices directly and only give the best option.'

        return dict(
            video=torch_imgs,
            video_path=video_path,
            instruct=instruct,
            letters=letters,
            options=options,
            answer_idx=answer_idx,
            task_type=task_type,
        )
76
+
77
+
78
# MVBench task table, keyed by task name. Each value is a 4-tuple consumed by
# build_mvbench_eval:
#   [0] annotation JSON file name (joined onto args.question_file)
#   [1] video sub-folder (joined onto args.video_folder)
#   [2] data type label ("video" or "frame"), stored as 'data_type'
#   [3] whether each record carries 'start'/'end' clip bounds ('bound')
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
100
+
101
+
102
def build_mvbench_eval(args, processor):
    """Build the MVBench evaluation DataLoader.

    For every entry of the module-level ``tasks`` table, the task's annotation
    JSON is read from ``args.question_file`` (used as a directory here) and each
    annotation is wrapped with its task name, video folder, data type and bound
    flag. The combined list is then restricted to chunk ``args.chunk_idx`` of
    ``args.num_chunks`` and wrapped in an unshuffled DataLoader.
    """
    samples = []
    for name, (json_name, sub_folder, data_type, bound) in tasks.items():
        annotation_path = os.path.join(args.question_file, json_name)
        video_root = os.path.join(args.video_folder, sub_folder)
        with open(annotation_path, 'r') as f:
            annotations = json.load(f)
        samples.extend(
            {
                'task_type': name,
                'prefix': video_root,
                'data_type': data_type,
                'bound': bound,
                'data': annotation,
            }
            for annotation in annotations
        )

    samples = get_chunk(samples, args.num_chunks, args.chunk_idx)
    return DataLoader(
        MVBenchDataset(samples, processor),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )
122
+
123
+
124
def mvbench_dump(vid, instruct, letters, options, output):
    """Map a free-form model answer to the index of the predicted option.

    Args:
        vid: Video identifier, used only in the warning message.
        instruct: Prompt given to the model, used only in the warning message.
        letters: Sequence of option letters in order (e.g. 'A', 'B', ...).
        options: Sequence of option texts, aligned with ``letters``.
        output: Raw text produced by the model.

    Returns:
        Index of the predicted option. When neither an option letter nor an
        option text can be found in ``output``, the historical default ``2``
        is returned.
    """
    output = output.replace('answer', '').replace('Answer', '')

    # The letter is captured on its own so surrounding parentheses, commas and
    # spaces never leak into the lookup below (", B," used to fail here).
    # NOTE: any capital letter in range matches, including the first letter of
    # a word; this is unchanged from the previous behaviour.
    pattern = rf'[\(,\ ]*([{letters[0]}-{letters[-1]}])[\),\ ]*'
    found = re.findall(pattern, output)

    pred_idx = None
    if found:
        pred_idx = letters.index(found[0])
    else:
        # No letter present: fall back to a case-insensitive option-text match.
        lowered = output.lower()
        for idx, opt in enumerate(options):
            if opt.lower() in lowered:
                pred_idx = idx
                break

    if pred_idx is None:
        print('The video "{}" instruct: \n"{}"\n output: \n"{}"\n is not in the expected format'.format(vid, instruct, output))
        pred_idx = 2

    return pred_idx
150
+
151
+
152
def run_inference(args):
    """Run MVBench multiple-choice inference and write one JSON line per sample.

    Each output line holds the video path, task type, predicted option index
    and ground-truth option index. The answer file is written to
    ``args.answer_file`` (``~`` expanded); its parent directory is created
    when needed.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    os.makedirs(os.path.dirname(answer_file) or '.', exist_ok=True)

    val_loader = build_mvbench_eval(args, processor['video'])

    # The context manager closes the file even if inference raises midway.
    with open(answer_file, "w") as ans_file:
        # NOTE: only support batch size 1 for now
        for line in tqdm(val_loader):
            vid = line['video_path'][0]
            video_tensor = line['video'][0]
            task_type = line['task_type'][0]
            instruct = line['instruct'][0]
            # Default collation transposes the per-sample lists; take sample 0.
            letters = list(zip(*line['letters']))[0]
            options = list(zip(*line['options']))[0]
            answer_idx = line['answer_idx'][0].item()

            output = mm_infer(
                video_tensor,
                instruct,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            pred_idx = mvbench_dump(vid, instruct, letters, options, output)

            ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
187
+
188
+
189
if __name__ == "__main__":
    # Command-line entry point for MVBench inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    # Joined with each task's JSON file name, so this must be a directory.
    parser.add_argument('--question-file', help='Directory containing the per-task question JSON files.', required=True)
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # Accepted for command-line compatibility; not read by run_inference here.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_perception_test_mcqa.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+ from tqdm import tqdm
9
+
10
+ import torch
11
+ from torch.utils.data import Dataset, DataLoader
12
+
13
+ import sys
14
+ sys.path.append('./')
15
+ from videollama2 import model_init, mm_infer
16
+ from videollama2.utils import disable_torch_init
17
+
18
+
19
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the list is short. An empty list yields no chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    if not lst:
        # A zero chunk size would make range() fail with a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    split_list can produce fewer than ``n`` chunks; an index that falls outside
    the produced chunks yields an empty list instead of raising IndexError, so
    a surplus worker simply has nothing to process.
    """
    chunks = split_list(lst, n)
    if not -len(chunks) <= k < len(chunks):
        return []
    return chunks[k]
28
+
29
+
30
class PerceptionTestMCQADataset(Dataset):
    """Dataset yielding one video together with all of its multiple-choice questions.

    Args:
        data_list: Records with ``metadata.video_id`` and a ``mc_question`` list
            whose items carry ``question``, ``id`` and ``options``.
        processor: Callable that turns a video path into the model input.
        video_folder: Directory searched for the video files. When omitted, the
            module-level ``args.video_folder`` is used (script behaviour).
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        self.data_list = data_list
        self.processor = processor
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path."""
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        # Previously this case surfaced as an UnboundLocalError.
        raise FileNotFoundError(f"Video file not found for {os.path.join(folder, video_name)}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['metadata']['video_id']
        mc_questions = line['mc_question']

        video_tensor = self.processor(self._find_video(video_name))

        instructs = []
        qids = []
        ops = []
        for q in mc_questions:
            question = q['question']
            options = q['options']
            # The prompt is fixed to three options, (A) to (C).
            instruct = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

            instructs.append(instruct)
            qids.append(q['id'])
            ops.append(options)

        return {
            'video': video_tensor,
            'video_id': video_name,
            'instructs': instructs,
            'question_ids': qids,
            'options': ops,
        }
74
+
75
+
76
def collate_fn(batch):
    """Collate dataset samples into (stacked videos, ids, instructs, question ids, options).

    Videos are stacked along a new leading dimension; every other field is
    returned as a plain per-sample list.
    """
    videos, video_ids, instructs, question_ids, options = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        video_ids.append(sample['video_id'])
        instructs.append(sample['instructs'])
        question_ids.append(sample['question_ids'])
        options.append(sample['options'])
    return torch.stack(videos, dim=0), video_ids, instructs, question_ids, options
84
+
85
+
86
def run_inference(args):
    """Run Perception Test multiple-choice inference.

    For every video one line ``"<video_id>": <json list>,`` is written to
    ``args.answer_file``; the list holds the question id, predicted option
    index and predicted option text for each question of that video.

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as f:
        questions = list(json.load(f).values())
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")
    dataset = PerceptionTestMCQADataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    os.makedirs(os.path.dirname(answer_file) or '.', exist_ok=True)

    letters = ['(A)', '(B)', '(C)']

    # The context manager closes the file even if inference raises midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensor, video_id, instructs, question_ids, options in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensor[0]
            video_id = video_id[0]
            instructs = instructs[0]
            question_ids = question_ids[0]
            options = options[0]

            qas = []
            for idx, instruct in enumerate(instructs):
                question_id = question_ids[idx]
                _options = options[idx]

                output = mm_infer(
                    video_tensor,
                    instruct,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )

                output = output.replace('answer', '').replace('Answer', '')
                pred_answer = re.findall(r'\(*[A-C]\)*', output)
                if pred_answer:
                    # Normalise "A", "(A", "A)" ... to the "(A)" form used in letters.
                    letter = pred_answer[0].strip().strip('()')
                    pred_idx = letters.index(f'({letter})')
                else:
                    print('The video "{}" instruct: \n"{}"\n output: \n"{}"\n is not in the expected format'.format(video_id, instruct, output))
                    # No letter found: accept an exact option text, else default to 2.
                    tmp_options = [x.lower() for x in _options]
                    if output.lower() in tmp_options:
                        pred_idx = tmp_options.index(output.lower())
                    else:
                        pred_idx = 2

                qas.append({'id': question_id, 'answer_id': pred_idx, 'answer': _options[pred_idx]})

            # One JSON-object fragment per video; the format is kept unchanged.
            ans_file.write('\"{}\": {},\n'.format(video_id, json.dumps(qas)))
152
+
153
+
154
if __name__ == "__main__":
    # Command-line entry point for Perception Test MCQA inference.
    # `args` stays module-level: the dataset falls back to args.video_folder.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions.', required=True)
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # --device and --model_max_length are accepted for command-line
    # compatibility; run_inference in this file does not read them.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_videomme.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import copy
6
+ import argparse
7
+ import warnings
8
+ import traceback
9
+
10
+ import cv2
11
+ import torch
12
+ import pysubs2
13
+ import numpy as np
14
+ import pyarrow.parquet as pq
15
+ from tqdm import tqdm
16
+ from torch.utils.data import Dataset, DataLoader
17
+
18
+ import sys
19
+ sys.path.append('./')
20
+ from videollama2 import model_init, mm_infer
21
+ from videollama2.utils import disable_torch_init
22
+
23
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
24
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
25
+
26
+
27
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the list is short. An empty list yields no chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    if not lst:
        # A zero chunk size would make range() fail with a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
31
+
32
+
33
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    split_list can produce fewer than ``n`` chunks; an index that falls outside
    the produced chunks yields an empty list instead of raising IndexError, so
    a surplus worker simply has nothing to process.
    """
    chunks = split_list(lst, n)
    if not -len(chunks) <= k < len(chunks):
        return []
    return chunks[k]
36
+
37
+
38
def get_seq_frames(total_num_frames, desired_num_frames):
    """
    Calculate the indices of frames to extract from a video.

    The range ``[0, total_num_frames - 1]`` is cut into ``desired_num_frames``
    equal segments and the middle index of each segment is returned.

    Parameters:
    total_num_frames (int): Total number of frames in the video.
    desired_num_frames (int): Desired number of frames to extract.

    Returns:
    list: List of indices of frames to extract; empty when
        ``desired_num_frames`` is not positive.
    """
    if desired_num_frames <= 0:
        # Avoids a ZeroDivisionError in the segment-size computation.
        return []

    # Size of each segment from which one frame will be extracted
    seg_size = float(total_num_frames - 1) / desired_num_frames

    # Rounded segment boundaries; segment i spans bounds[i]..bounds[i + 1].
    bounds = [int(np.round(seg_size * i)) for i in range(desired_num_frames + 1)]

    # Middle index of every segment
    return [(start + end) // 2 for start, end in zip(bounds, bounds[1:])]
63
+
64
+
65
class VideoMMEDataset(Dataset):
    """Dataset yielding a video, the subtitles aligned to its frames, and the raw record.

    Args:
        video_folder: Directory searched for ``<youtube id><ext>`` files.
        subtitle_folder: Directory searched for ``<youtube id>.srt`` files.
        data_list: Records with a ``url`` field ending in ``watch?v=<youtube id>``.
        processor: Callable that turns a video path into the model input; its
            result must expose ``shape[0]`` as the number of frames.
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, video_folder, subtitle_folder, data_list, processor):
        self.video_folder = video_folder
        self.subtitle_folder = subtitle_folder
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_ytid):
        """Return the first existing video path for the id, or None."""
        for fmt in self.video_formats:
            candidate = os.path.join(self.video_folder, f'{video_ytid}{fmt}')
            if os.path.exists(candidate):
                return candidate
        return None

    def _load_subtitles(self, video_path, subtitle_path, num_frames):
        """Return the subtitle lines shown at the sampled frames, newline-joined."""
        cv2_vr = cv2.VideoCapture(video_path)
        try:
            duration = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cv2_vr.get(cv2.CAP_PROP_FPS)
        finally:
            # The capture handle was previously never released.
            cv2_vr.release()

        # NOTE(review): a non-positive fps presumably makes pysubs2.make_time
        # fail; treat such a video as having no usable subtitles - confirm.
        if fps <= 0 or num_frames <= 0:
            return ""

        selected_frame_ids = get_seq_frames(duration, num_frames)
        subs = pysubs2.load(subtitle_path, encoding="utf-8")

        subtitles = []
        for selected_frame_id in selected_frame_ids:
            sub_text = ""
            cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
            for sub in subs:
                # First subtitle whose interval strictly contains the timestamp.
                if sub.start < cur_time and sub.end > cur_time:
                    sub_text = sub.text.replace("\\N", " ")
                    break
            if sub_text.strip():
                subtitles.append(sub_text)
        return "\n".join(subtitles)

    def __getitem__(self, idx):
        line = self.data_list[idx]

        video_ytid = line['url'].split('watch?v=')[-1]
        video_path = self._find_video(video_ytid)
        subtitle_path = os.path.join(self.subtitle_folder, f'{video_ytid}.srt')

        video_tensor = None
        num_frames = 0
        if video_path is None:
            print(f'Video file not found for {video_ytid}')
        else:
            try:
                video_tensor = self.processor(video_path)
                num_frames = video_tensor.shape[0]
            except Exception:
                # Best effort: an unreadable video is reported as missing.
                traceback.print_exc()
                print(f'It occurs error when reading {video_ytid}')
                video_tensor = None
                num_frames = 0

        if video_tensor is not None and os.path.exists(subtitle_path):
            subtitles = self._load_subtitles(video_path, subtitle_path, num_frames)
        else:
            subtitles = ""

        return {
            'video': video_tensor,
            'subtitle': subtitles,
            'record': line,
        }
126
+
127
+
128
def collate_fn(batch):
    """Regroup dataset samples into parallel lists (videos, subtitles, records).

    Nothing is stacked: a video entry may be None when loading failed.
    """
    videos, subtitles, records = [], [], []
    for sample in batch:
        videos.append(sample['video'])
        subtitles.append(sample['subtitle'])
        records.append(sample['record'])
    return videos, subtitles, records
133
+
134
+
135
def load_parquet(parquet_file):
    """Load the question parquet file and group its rows by video.

    Consecutive rows sharing the same ``video_id`` are merged into one record
    that carries the video metadata and a ``questions`` list.

    Args:
        parquet_file: Path to the parquet file.

    Returns:
        list[dict]: One record per video, in file order.
    """
    table = pq.read_table(parquet_file)

    # Convert PyArrow Table to pandas DataFrame
    df = table.to_pandas()

    jsons = []
    for record in df.itertuples():
        # Start a new video record whenever the id changes. The previous
        # `len(jsons) < int(video_id)` test only worked when ids were the
        # contiguous counters 1, 2, 3, ... starting at the first row.
        if not jsons or jsons[-1]["video_id"] != record.video_id:
            jsons.append({
                "video_id": record.video_id,
                "youtube_id": record.videoID,
                "url": record.url,
                "duration": record.duration,
                "domain": record.domain,
                "sub_category": record.sub_category,
                "questions": [],
            })

        jsons[-1]["questions"].append({
            "question_id": record.question_id,
            "task_type": record.task_type,
            "question": record.question,
            "choices": list(record.options),
            "answer": record.answer,
        })

    return jsons
172
+
173
+
174
def build_videomme_eval(args, processor):
    """Build the evaluation DataLoader over this worker's chunk of the questions.

    The questions come from the parquet file at ``args.question_file``; chunk
    ``args.chunk_idx`` of ``args.num_chunks`` is kept and served unshuffled.
    """
    # convert parquet to json, then keep only the requested chunk
    chunk = get_chunk(load_parquet(args.question_file), args.num_chunks, args.chunk_idx)
    return DataLoader(
        VideoMMEDataset(args.video_folder, args.subtitle_folder, chunk, processor),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
183
+
184
+
185
def videomme_dump(record, instruct, options, output):
    """Map a free-form model answer to one of the letters 'A'-'D'.

    Args:
        record: Video record; its ``youtube_id`` is used only in the warning.
        instruct: Prompt given to the model, used only in the warning.
        options: Option texts aligned with the letters A-D.
        output: Raw text produced by the model.

    Returns:
        The predicted letter. When neither a letter nor an option text (also
        tried with a single digit spelled out) is found, the historical
        default 'C' is returned.
    """
    letters = ['A', 'B', 'C', 'D']

    digit2word = {
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine',
        '0': 'zero',
    }

    output = output.replace('answer', '').replace('Answer', '')
    found = re.findall(r'[\(\ \[]*([A-D])[\)\.\ \]]*', output)

    pred_idx = None
    if found:
        # The capture group already isolates the bare letter.
        pred_idx = letters.index(found[0])
    else:
        lowered = output.lower()
        for idx, opt in enumerate(options):
            # Arabic numerals -> English words
            opt2 = digit2word.get(opt, opt)
            if opt.lower() in lowered or opt2.lower() in lowered:
                pred_idx = idx
                break

    if pred_idx is None:
        # Explicit fallback instead of assert/except, so it also works under -O.
        print('The video "{}" instruct: \n"{}"\n output: \n"{}"\n is not in the expected format'.format(record.get('youtube_id'), instruct, output))
        pred_idx = 2

    return letters[pred_idx]
228
+
229
+
230
def _sub_answer_path(answer_file):
    """Return the path of the with-subtitles answer file, never equal to answer_file."""
    sub_path = answer_file.replace('.json', '_sub.json')
    if sub_path == answer_file:
        # No '.json' in the path: avoid both outputs clobbering the same file.
        sub_path = answer_file + '_sub'
    return sub_path


def run_inference(args):
    """Run Video-MME inference with and without subtitles.

    Every video record is answered twice per question: once from the video
    alone (written to ``args.answer_file``) and once with the subtitles
    prepended to the prompt (written to the ``_sub`` sibling file). Each record
    is written as a JSON object followed by ``,`` and a newline; records whose
    video could not be loaded are marked with ``missing: true``.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    answer_sub_file = _sub_answer_path(answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    os.makedirs(os.path.dirname(answer_file) or '.', exist_ok=True)

    val_loader = build_videomme_eval(args, processor['video'])

    # The context manager closes both files even if inference raises midway.
    with open(answer_file, "w") as ans_file, open(answer_sub_file, "w") as ans_sub_file:
        # Iterate over each sample in the ground truth file
        for videos, subtitles, records in tqdm(val_loader):
            video_tensor = videos[0]
            subtitle = subtitles[0]
            record = records[0]

            new_record = copy.deepcopy(record)
            new_record_sub = copy.deepcopy(record)

            missing = video_tensor is None
            new_record['missing'] = missing
            new_record_sub['missing'] = missing
            if missing:
                ans_file.write(json.dumps(new_record) + ",\n")
                ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
                continue

            for idx, question in enumerate(record['questions']):
                q = question['question']
                choices = question['choices']
                # Strip the "A. " prefix and the final character of each choice;
                # a choice that does not match the pattern is used verbatim.
                options = []
                for c in choices:
                    matched = re.findall(r'[A-D]\. (.*).', c)
                    options.append(matched[0] if matched else c)

                instruct = "Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.\n"
                instruct += f"{q}\n"
                for cho in choices:
                    instruct += f"{cho}\n"
                instruct += "Answer with the option\'s letter from the given choices directly and only give the best option. The best answer is: "
                output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
                new_record['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)

                # Second pass: same prompt with the subtitles prepended.
                instruct = f"This video's subtitles are listed below:\n{subtitle}\n" + instruct
                output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
                new_record_sub['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)

            ans_file.write(json.dumps(new_record) + ",\n")
            ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
287
+
288
+
289
if __name__ == "__main__":
    # Command-line entry point for Video-MME inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--subtitle-folder', help='Directory containing subtitle files.', required=True)
    parser.add_argument('--question-file', help='Path to the parquet file containing the questions.', required=True)
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # Accepted for command-line compatibility; not read by run_inference here.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_activitynet.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import argparse
5
+ import warnings
6
+ import traceback
7
+ from tqdm import tqdm
8
+
9
+ from torch.utils.data import Dataset, DataLoader
10
+
11
+ import sys
12
+ sys.path.append('./')
13
+ from videollama2 import model_init, mm_infer
14
+ from videollama2.utils import disable_torch_init
15
+
16
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
17
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
18
+
19
+
20
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the list is short. An empty list yields no chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    if not lst:
        # A zero chunk size would make range() fail with a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
24
+
25
+
26
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    split_list can produce fewer than ``n`` chunks; an index that falls outside
    the produced chunks yields an empty list instead of raising IndexError, so
    a surplus worker simply has nothing to process.
    """
    chunks = split_list(lst, n)
    if not -len(chunks) <= k < len(chunks):
        return []
    return chunks[k]
29
+
30
+
31
class ActivitynetDataset(Dataset):
    """Dataset pairing each open-ended question with its video and reference answer.

    Args:
        questions: Records with ``video_name``, ``question`` and ``question_id``.
        answers: Records with ``answer``, aligned index-by-index with ``questions``.
        processor: Callable that turns a video path into the model input.
        video_folder: Directory searched for the video files. When omitted, the
            module-level ``args.video_folder`` is used (script behaviour).
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, questions, answers, processor, video_folder=None):
        self.questions = questions
        self.answers = answers
        self.processor = processor
        self.video_folder = video_folder

    def __len__(self):
        return len(self.questions)

    def _find_video(self, video_name):
        """Return the first existing path for the video name.

        For every extension ``v_<name><ext>`` is tried first, then the bare
        ``<name><ext>`` (compatibility for MSVD, MSRVTT, TGIF).

        Raises:
            FileNotFoundError: If no candidate exists.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            for file_name in (f"v_{video_name}{fmt}", f"{video_name}{fmt}"):
                candidate = os.path.join(folder, file_name)
                if os.path.exists(candidate):
                    return candidate
        raise FileNotFoundError(f"Video file not found for {os.path.join(folder, video_name)}")

    def __getitem__(self, idx):
        sample = self.questions[idx]
        answer = self.answers[idx]['answer']

        video_name = sample['video_name']
        video_tensor = self.processor(self._find_video(video_name))

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': sample['question'],
            'question_id': sample['question_id'],
            'answer': answer,
        }
76
+
77
+
78
def collate_fn(batch):
    """Regroup samples into parallel lists: videos, names, questions, ids, answers."""
    fields = ('video', 'video_name', 'question', 'question_id', 'answer')
    videos, names, questions, question_ids, answers = (
        [sample[field] for sample in batch] for field in fields
    )
    return videos, names, questions, question_ids, answers
85
+
86
+
87
def run_inference(args):
    """Run open-ended video QA inference and write one JSON line per question.

    Questions and reference answers are read from ``args.question_file`` and
    ``args.answer_file`` and chunked identically. Each output line written to
    ``args.output_file`` holds the question id, question, reference answer and
    model prediction; a failed inference is recorded with prediction "error".

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as f:
        gt_questions = json.load(f)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
    with open(args.answer_file, "r") as f:
        gt_answers = json.load(f)
    gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)

    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")
    dataset = ActivitynetDataset(gt_questions, gt_answers, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = args.output_file
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    os.makedirs(os.path.dirname(answer_file) or '.', exist_ok=True)

    # The context manager closes the file even if the loop raises midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions, question_ids, answers in tqdm(dataloader):
            video_tensor = video_tensors[0]
            question = questions[0]
            question_id = question_ids[0]
            answer = answers[0]

            try:
                output = mm_infer(
                    video_tensor,
                    question,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )
            except Exception:
                # Best effort per sample; unlike a bare except this still lets
                # KeyboardInterrupt / SystemExit stop the run.
                traceback.print_exc()
                output = "error"

            sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
133
+
134
+
135
if __name__ == "__main__":
    # Command-line entry point for open-ended video QA inference.
    # `args` stays module-level: the dataset falls back to args.video_folder.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
    # Opened as a file by run_inference, so this is a file path, not a directory.
    parser.add_argument('--output-file', help='Path of the output file the model results are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # Accepted for command-line compatibility; not read by run_inference here.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_vcgpt_consistency.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the list is short. An empty list yields no chunks.

    Raises:
        ValueError: If ``n`` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    if not lst:
        # A zero chunk size would make range() fail with a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    split_list can produce fewer than ``n`` chunks; an index that falls outside
    the produced chunks yields an empty list instead of raising IndexError, so
    a surplus worker simply has nothing to process.
    """
    chunks = split_list(lst, n)
    if not -len(chunks) <= k < len(chunks):
        return []
    return chunks[k]
30
+
31
+
32
class VCGPTDataset(Dataset):
    """Dataset yielding a video with the two paraphrased questions asked about it.

    Args:
        data_list: Records with ``Q1``, ``Q2``, ``A`` and ``video_name``.
        processor: Callable that turns a video path into the model input.
        video_folder: Directory searched for the video files. When omitted, the
            module-level ``args.video_folder`` is used (script behaviour).
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        self.data_list = data_list
        self.processor = processor
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path."""
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        # Previously this case surfaced as an UnboundLocalError.
        raise FileNotFoundError(f"Video file not found for {os.path.join(folder, video_name)}")

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['video_name']

        video_tensor = self.processor(self._find_video(video_name))

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question1': line['Q1'],
            'question2': line['Q2'],
            'answer': line['A'],
        }
65
+
66
+
67
def collate_fn(batch):
    """Collate samples into (stacked videos, names, first questions, second questions, answers).

    Videos are stacked along a new leading dimension; every other field is
    returned as a plain per-sample list.
    """
    videos, names, first_questions, second_questions, answers = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        names.append(sample['video_name'])
        first_questions.append(sample['question1'])
        second_questions.append(sample['question2'])
        answers.append(sample['answer'])
    return torch.stack(videos, dim=0), names, first_questions, second_questions, answers
75
+
76
+
77
def run_inference(args):
    """Run consistency inference: answer both paraphrased questions of every sample.

    Each line written to ``args.answer_file`` is a JSON object with the video
    name, both questions (``Q1``/``Q2``), the reference answer (``A``) and both
    predictions (``P1``/``P2``).

    Raises:
        ValueError: If ``args.batch_size`` is not 1.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    if args.batch_size != 1:
        raise ValueError("Batch size must be 1 for inference")
    dataset = VCGPTDataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    os.makedirs(os.path.dirname(answer_file) or '.', exist_ok=True)

    # The context manager closes the file even if inference raises midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions1, questions2, answers in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question1 = questions1[0]
            question2 = questions2[0]
            answer = answers[0]

            output1 = mm_infer(
                video_tensor,
                question1,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            output2 = mm_infer(
                video_tensor,
                question2,
                model=model,
                tokenizer=tokenizer,
                do_sample=False,
                modal='video',
            )

            qa = {'video_name': video_name, 'Q1': question1, 'Q2': question2, 'A': answer, 'P1': output1, 'P2': output2}

            ans_file.write(json.dumps(qa) + "\n")
129
+
130
+
131
if __name__ == "__main__":
    # Command-line entry point for consistency inference.
    # `args` stays module-level: the dataset falls back to args.video_folder.
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    # --model_base, --conv-mode, --device and --model_max_length are accepted
    # for command-line compatibility; run_inference here does not read them.
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_vcgpt_general.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, (roughly) equal-sized chunks.

    Every chunk holds ``ceil(len(lst) / n)`` items except possibly the last
    one, which holds the remainder. Fewer than ``n`` chunks are returned when
    the items run out early (e.g. 5 items with ``n=4`` give 3 chunks).

    Args:
        lst: Sequence to split (must support ``len`` and slicing).
        n: Requested number of chunks; must be a positive integer.

    Returns:
        A list of slices of ``lst``; an empty list when ``lst`` is empty.
    """
    # An empty input would yield chunk_size == 0 and make range() raise
    # "arg 3 must not be zero", so handle it explicitly.
    if len(lst) == 0:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return chunk number ``k`` (0-based) of ``lst`` split into ``n`` chunks."""
    return split_list(lst, n)[k]
30
+
31
+
32
class VCGPTDataset(Dataset):
    """Dataset yielding one preprocessed video plus its question/answer pair.

    Each entry of ``data_list`` is a mapping with the keys ``'Q'``, ``'A'``
    and ``'video_name'``. The video file is looked up in the video folder by
    trying each extension in ``video_formats`` in order.

    Args:
        data_list: Sequence of sample mappings.
        processor: Callable that takes a video path and returns the
            preprocessed video.
        video_folder: Directory holding the videos. When omitted, the
            module-level ``args.video_folder`` set by the command-line entry
            point is used (original behaviour).
    """

    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        self.data_list = data_list
        self.processor = processor
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path."""
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        # Previously this case surfaced as an UnboundLocalError on `video_path`.
        raise FileNotFoundError(
            f"No video named '{video_name}' with any of the extensions "
            f"{self.video_formats} found in '{folder}'."
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question = line['Q']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._find_video(video_name)
        video_tensor = self.processor(video_path)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': question,
            'answer': answer,
        }
63
+
64
+
65
def collate_fn(batch):
    """Collate dataset samples into ``(videos, video_names, questions, answers)``.

    The per-sample ``'video'`` tensors are stacked along a new leading batch
    dimension; the remaining fields are returned as plain lists in batch order.
    """
    frames = [sample['video'] for sample in batch]
    names = [sample['video_name'] for sample in batch]
    questions = [sample['question'] for sample in batch]
    answers = [sample['answer'] for sample in batch]
    return torch.stack(frames, dim=0), names, questions, answers
72
+
73
+
74
def run_inference(args):
    """Run video question answering over one chunk of the question file.

    Loads the model, answers every question of the selected chunk and writes
    one JSON object per line (keys ``video_name``, ``Q``, ``A``, ``P``) to
    ``args.answer_file``.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``question_file``, ``answer_file``, ``num_chunks``, ``chunk_idx``,
            ``batch_size`` and ``num_workers``.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    # Close the question file deterministically instead of leaking the handle.
    with open(args.question_file, "r") as question_fp:
        questions = json.load(question_fp)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    # A bare file name has no directory part; os.makedirs('') would raise.
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager guarantees the results are flushed and the file is
    # closed even when inference fails part-way through.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, batch_questions, answers in tqdm(dataloader):
            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = batch_questions[0]
            answer = answers[0]

            output = mm_infer(
                video_tensor,
                question,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            qa = {'video_name': video_name, 'Q': question, 'A': answer, 'P': output}

            ans_file.write(json.dumps(qa) + "\n")
114
+
115
+
116
if __name__ == "__main__":
    # Command-line entry point: parse the evaluation options and run inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Path or hub id of the model checkpoint to evaluate.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing questions.', required=True)
    # The answer file is an output: run_inference opens it for writing.
    parser.add_argument('--answer-file', help='Path of the output file the predicted answers are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    # NOTE: `args` must stay a module-level name; VCGPTDataset reads
    # `args.video_folder` as a global.
    args = parser.parse_args()

    run_inference(args)
videollama2/mm_utils.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ import math
4
+ import base64
5
+ import traceback
6
+ from io import BytesIO
7
+
8
+ import cv2
9
+ import torch
10
+ import imageio
11
+ import numpy as np
12
+ from PIL import Image
13
+ from decord import VideoReader, cpu
14
+ from transformers import StoppingCriteria
15
+
16
+ from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MODAL_INDEX_MAP, DEFAULT_IMAGE_TOKEN
17
+
18
+
19
def chunk_list(input_list, chunk_size):
    """Cut ``input_list`` into consecutive slices of ``chunk_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``chunk_size``.
    """
    starts = range(0, len(input_list), chunk_size)
    return [input_list[begin:begin + chunk_size] for begin in starts]
21
+
22
+
23
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it as a PIL image."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
25
+
26
+
27
def expand2square(pil_img, background_color):
    """Pad a PIL image to a square canvas, centring it on the shorter axis.

    A square image is returned unchanged (same object). Otherwise a new image
    of side ``max(width, height)`` filled with ``background_color`` is
    created and the original is pasted in the middle.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    if width > height:
        # Landscape: centre vertically.
        offset = (0, (width - height) // 2)
    else:
        # Portrait: centre horizontally.
        offset = ((height - width) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas
39
+
40
+
41
def create_photo_grid(arr, rows=None, cols=None):
    """Tile a stack of equally sized frames into one image, row by row.

    Parameters:
        arr (numpy.ndarray | list): Frames with shape [t, h, w, c], or a list
            of PIL images / numpy arrays that is stacked into that shape.
        rows (int): Optional number of grid rows. Derived from ``cols`` or
            from the square root of ``t`` when omitted.
        cols (int): Optional number of grid columns. Derived from ``rows``
            when omitted.

    Returns:
        numpy.ndarray: Array of shape [h * rows, w * cols, c]; cells without
        a frame stay zero.

    Raises:
        ValueError: If a list holds unsupported items, or the grid has fewer
            cells than there are frames.
    """
    if isinstance(arr, list):
        first = arr[0]
        if isinstance(first, Image.Image):
            arr = np.stack([np.array(img) for img in arr])
        elif isinstance(first, np.ndarray):
            arr = np.stack(arr)
        else:
            raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")

    t, h, w, c = arr.shape

    # Fill in whichever grid dimension(s) the caller left unspecified.
    if rows is None:
        if cols is None:
            rows = math.ceil(math.sqrt(t))
            cols = math.ceil(t / rows)
        else:
            rows = math.ceil(t / cols)
    elif cols is None:
        cols = math.ceil(t / rows)

    if rows * cols < t:
        raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")

    grid = np.zeros((h * rows, w * cols, c), dtype=arr.dtype)

    # Place frame i at (row, column) = divmod(i, cols).
    for i, frame in enumerate(arr):
        r, k = divmod(i, cols)
        grid[r * h:(r + 1) * h, k * w:(k + 1) * w, :] = frame

    return grid
89
+
90
+
91
def process_image(image_path, processor, aspect_ratio='pad'):
    """Load one image and run it through the image processor.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        processor: Image processor exposing ``image_mean`` and
            ``preprocess(images, return_tensors=...)``.
        aspect_ratio: ``'pad'`` pads the image to a square filled with the
            processor's mean colour before preprocessing; any other value
            passes the image through unpadded.

    Returns:
        The ``'pixel_values'`` entry of the processor output for the
        single-image batch.
    """
    # convert() loads the pixel data, so the underlying file can be closed
    # right away instead of staying open until garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    images = [image]
    if aspect_ratio == 'pad':
        fill_color = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(img, fill_color) for img in images]

    return processor.preprocess(images, return_tensors='pt')['pixel_values']
104
+
105
+
106
def frame_sample(duration, mode='uniform', num_frames=None, fps=None):
    """Pick frame positions out of a clip that is ``duration`` frames long.

    Args:
        duration: Number of frames available.
        mode: ``'uniform'`` splits the clip into ``num_frames`` equal segments
            and takes the middle of each one; ``'fps'`` takes one frame every
            ``fps // NUM_FRAMES_PER_SECOND`` frames.
        num_frames: Number of frames to return (required for ``'uniform'``).
        fps: Frame rate of the clip (required for ``'fps'``).

    Returns:
        numpy.ndarray of integer positions in ``[0, duration)``.

    Raises:
        ImportError: For an unknown ``mode``. The exception type is kept from
            the original implementation for backward compatibility.
    """
    if mode == 'uniform':
        assert num_frames is not None, "Number of frames must be provided for uniform sampling."
        # NOTE: v1 version
        # Size of each segment from which one frame is extracted
        seg_size = float(duration - 1) / num_frames

        # Middle of segment i, i.e. the mean of its start and end position
        frame_ids = [(seg_size * i + seg_size * (i + 1)) / 2 for i in range(num_frames)]

        # The small epsilon makes exact .5 midpoints round up consistently.
        return np.round(np.array(frame_ids) + 1e-6).astype(int)
        # NOTE: v0 version
        # return np.linspace(0, duration-1, num_frames, dtype=int)
    elif mode == 'fps':
        assert fps is not None, "FPS must be provided for FPS sampling."
        # fps may be a float (e.g. 29.97); use an integer stride, and never a
        # zero stride: a frame rate below NUM_FRAMES_PER_SECOND or an empty
        # clip previously made np.arange fail on a zero step.
        segment_len = min(int(fps // NUM_FRAMES_PER_SECOND), duration)
        segment_len = max(segment_len, 1)
        return np.arange(segment_len // 2, duration, segment_len, dtype=int)
    else:
        raise ImportError(f'Unsupported frame sampling mode: {mode}')
130
+
131
+
132
def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num_frames=NUM_FRAMES):
    """Load a video, sample frames from it and run them through the processor.

    Args:
        video_path: One of: a path to a video file, a ``.gif`` file or a
            directory of frame images; a numpy array of frames; or a list of
            numpy arrays, image paths or PIL images.
        processor: Image processor exposing ``image_mean`` and
            ``preprocess(images, return_tensors=...)``.
        s: Optional start time in seconds (only used for path inputs).
        e: Optional end time in seconds (only used for path inputs).
        aspect_ratio: ``'pad'`` pads every frame to a square filled with the
            processor's mean colour before preprocessing.
        num_frames: Number of frames to sample uniformly. ``None`` switches
            to fps-based sampling.

    Returns:
        The ``'pixel_values'`` entry of the processor output for the frames.

    Raises:
        ValueError: If ``video_path`` has an unsupported type.
    """
    if isinstance(video_path, str):
        if s is not None and e is not None:
            # Clamp to non-negative times and make sure s < e.
            s = s if s >= 0. else 0.
            e = e if e >= 0. else 0.
            if s > e:
                s, e = e, s
            elif s == e:
                e = s + 1

        # 1. Loading Video
        if os.path.isdir(video_path):
            frame_files = sorted(os.listdir(video_path))

            fps = 3
            num_frames_of_video = len(frame_files)
        elif video_path.endswith('.gif'):
            gif_reader = imageio.get_reader(video_path)

            fps = 25
            num_frames_of_video = len(gif_reader)
        else:
            vreader = VideoReader(video_path, num_threads=2)

            fps = vreader.get_avg_fps()
            num_frames_of_video = len(vreader)

        # 2. Determine frame range & Calculate frame indices
        f_start = 0 if s is None else max(int(s * fps) - 1, 0)
        f_end = num_frames_of_video - 1 if e is None else min(int(e * fps) - 1, num_frames_of_video - 1)
        frame_indices = list(range(f_start, f_end + 1))

        duration = len(frame_indices)
        # 3. Sampling frame indices
        if num_frames is None:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', fps=fps)]
        else:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=num_frames)]

        # 4. Acquire frame data
        if os.path.isdir(video_path):
            video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
        elif video_path.endswith('.gif'):
            # Set lookup instead of scanning the index list for every gif frame.
            wanted = set(sampled_frame_indices)
            video_data = [Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)) for idx, frame in enumerate(gif_reader) if idx in wanted]
        else:
            video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]

    elif isinstance(video_path, np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], str):
        video_data = [Image.open(f) for f in video_path]
    elif isinstance(video_path, list) and isinstance(video_path[0], Image.Image):
        # Copy so the padding below never mutates the caller's list.
        video_data = list(video_path)
    else:
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    # Pad short clips with black frames of the same size as the last frame.
    while num_frames is not None and len(video_data) < num_frames:
        # PIL reports size as (width, height); numpy arrays are (height, width, c).
        width, height = video_data[-1].size
        video_data.append(Image.fromarray(np.zeros((height, width, 3), dtype=np.uint8)))

    # MAX_FRAMES filter
    video_data = video_data[:MAX_FRAMES]

    if aspect_ratio == 'pad':
        fill_color = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(f, fill_color) for f in video_data]
    else:
        images = list(video_data)
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']
    return video
203
+
204
+
205
def process_video_old(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
    """Legacy video preprocessing kept alongside ``process_video``.

    Args:
        video_path: Path to a video / ``.gif`` file, a numpy array of exactly
            ``num_frames`` frames, or a list of exactly ``num_frames`` frames.
        processor: Image processor exposing ``image_mean`` and
            ``preprocess(images, return_tensors=...)``.
        aspect_ratio: ``'pad'`` pads every frame to a square filled with the
            processor's mean colour before preprocessing.
        num_frames: Number of frames sampled in ``'uniform'`` mode.
        image_grid: When true, a photo grid of all frames is prepended to the
            frame list.
        sample_scheme: ``'uniform'`` or ``'fps'``.

    Returns:
        The ``'pixel_values'`` entry of the processor output for the frames.

    Raises:
        ValueError: If ``video_path`` has an unsupported type.
    """
    def frame_sample(duration, mode='uniform', local_fps=None):
        if mode == 'uniform':
            # Calculate the size of each segment from which a frame will be extracted
            seg_size = float(duration - 1) / num_frames

            frame_ids = []
            for i in range(num_frames):
                # Calculate the start and end indices of each segment
                start = int(np.round(seg_size * i))
                end = int(np.round(seg_size * (i + 1)))
                # Append the middle index of the segment to the list
                frame_ids.append((start + end) // 2)

            return frame_ids
            # NOTE: old version
            # return np.linspace(0, duration-1, num_frames, dtype=int)
        elif mode == 'fps':
            assert local_fps is not None
            segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
        else:
            raise ImportError(f'Unsupported frame sampling mode: {mode}')

    if isinstance(video_path, str):
        if video_path.endswith('.gif'):
            video_gif = imageio.get_reader(video_path)
            duration, local_fps = len(video_gif), 10

            frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
            # limit the max input frames
            if len(frame_id_list) > MAX_FRAMES:
                frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
            video_data = [frame for index, frame in enumerate(video_gif) if index in frame_id_list]
        else:
            # NOTE: num_threads=1 is required to avoid deadlock in multiprocessing
            # decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
            decord_vr = VideoReader(video_path, ctx=cpu(0), num_threads=2)
            duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())

            frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
            # limit the max input frames
            if len(frame_id_list) > MAX_FRAMES:
                frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
            # The batch object offers either .numpy() or .asnumpy(); only a
            # missing .numpy() should trigger the fallback, so real decoding
            # errors are no longer swallowed by a bare except.
            try:
                video_data = decord_vr.get_batch(frame_id_list).numpy()
            except AttributeError:
                video_data = decord_vr.get_batch(frame_id_list).asnumpy()

    elif isinstance(video_path, np.ndarray):
        assert len(video_path) == num_frames
        video_data = video_path
    elif isinstance(video_path, list):
        assert len(video_path) == num_frames
        video_data = np.stack([np.array(x) for x in video_path])
    else:
        # Previously this fell through and failed later on an unbound `video_data`.
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    if image_grid:
        grid_h = grid_w = math.ceil(math.sqrt(num_frames))
        pg = create_photo_grid(video_data, grid_h, grid_w)
        video_data = [pg, *video_data]

    images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
    if aspect_ratio == 'pad':
        fill_color = tuple(int(x*255) for x in processor.image_mean)
        images = [expand2square(image, fill_color) for image in images]
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']

    return video
275
+
276
+
277
def tokenizer_multimodal_token(prompt, tokenizer, multimodal_token=DEFAULT_IMAGE_TOKEN, return_tensors=None):
    """Tokenize a prompt, replacing each multimodal tag with its token index.

    Args:
        prompt (str): Text prompt (w/ multimodal tag), e.g., '<video>\nDescribe the video.'
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer object.
        multimodal_token (str): Multimodal tag to look up in ``MODAL_INDEX_MAP``.
            When the tag has no entry, the prompt is tokenized as plain text.
        return_tensors (str | None): ``'pt'`` returns a ``torch.long`` tensor,
            ``None`` returns a plain list of ids.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    multimodal_token_index = MODAL_INDEX_MAP.get(multimodal_token, None)
    if multimodal_token_index is None:
        input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
    else:
        # Tokenize the text between the tags, then stitch the pieces back
        # together with one multimodal index between neighbouring pieces.
        pieces = prompt.split(multimodal_token)
        tokenized_pieces = [tokenizer(piece, add_special_tokens=False).input_ids for piece in pieces]

        input_ids = []
        for position, piece_ids in enumerate(tokenized_pieces):
            if position > 0:
                input_ids.append(multimodal_token_index)
            input_ids.extend(piece_ids)

    if return_tensors is None:
        return input_ids
    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
303
+
304
+
305
def get_model_name_from_path(model_path):
    """Derive a model name from the last component(s) of ``model_path``.

    Surrounding slashes are ignored. For a ``checkpoint-*`` directory the
    parent directory name is prepended, giving ``<parent>_<checkpoint-*>``.
    """
    parts = model_path.strip("/").split("/")
    leaf = parts[-1]
    if not leaf.startswith('checkpoint-'):
        return leaf
    return parts[-2] + "_" + leaf
312
+
313
+
314
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once every sequence in the batch contains a keyword.

    A keyword matches either when the tail of the generated ids equals the
    keyword's token ids, or when the keyword text occurs in the decoded tail
    of the generated ids.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            ids = tokenizer(keyword).input_ids
            # Drop a leading BOS token added by the tokenizer.
            if len(ids) > 1 and ids[0] == tokenizer.bos_token_id:
                ids = ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(ids))
            self.keyword_ids.append(torch.tensor(ids))
        self.tokenizer = tokenizer
        # Prompt length: ids beyond this position were generated.
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check a single sequence (batch of size one) for any keyword."""
        generated_len = output_ids.shape[1] - self.start_len
        offset = min(generated_len, self.max_keyword_len)
        self.keyword_ids = [ids.to(output_ids.device) for ids in self.keyword_ids]

        # Exact token-id match on the tail of the sequence.
        for ids in self.keyword_ids:
            tail = output_ids[0, -ids.shape[0]:]
            if (tail == ids).all():
                return True

        # Fall back to a text match on the decoded tail.
        decoded_tail = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        return any(keyword in decoded_tail for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Evaluate every row (list, not generator) before combining the results.
        per_sequence = [
            self.call_for_batch(output_ids[row].unsqueeze(0), scores)
            for row in range(output_ids.shape[0])
        ]
        return all(per_sequence)
videollama2/model/__init__.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ import shutil
20
+
21
+ import torch
22
+ from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
+
24
+ from .projector import load_mm_projector
25
+ from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2LlamaConfig
26
+ from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
27
+ from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
28
+ from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
29
+
30
+
31
# Maps a config `model_type` string to the model class implementing it.
# The bare "videollama2" type uses the Mistral-based variant.
VLLMs = dict(
    videollama2=Videollama2MistralForCausalLM,
    videollama2_llama=Videollama2LlamaForCausalLM,
    videollama2_mistral=Videollama2MistralForCausalLM,
    videollama2_mixtral=Videollama2MixtralForCausalLM,
    videollama2_qwen2=Videollama2Qwen2ForCausalLM,
)
38
+
39
# Maps a config `model_type` string to the matching configuration class.
# The bare "videollama2" type uses the Mistral-based variant.
VLLMConfigs = dict(
    videollama2=Videollama2MistralConfig,
    videollama2_llama=Videollama2LlamaConfig,
    videollama2_mistral=Videollama2MistralConfig,
    videollama2_mixtral=Videollama2MixtralConfig,
    videollama2_qwen2=Videollama2Qwen2Config,
)
46
+
47
+
48
def _select_videollama2_class(model_type):
    """Return the model class used to load a checkpoint of ``model_type``."""
    if model_type in ['videollama2_mixtral']:
        return Videollama2MixtralForCausalLM
    if model_type in ['videollama2_qwen2']:
        return Videollama2Qwen2ForCausalLM
    # 'videollama2', 'videollama2_mistral' and every other type use the
    # Mistral-based class.
    # NOTE(review): 'videollama2_llama' also lands here although a dedicated
    # Llama class exists -- behaviour kept from the original; confirm intent.
    return Videollama2MistralForCausalLM


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a tokenizer, model and image processor from a checkpoint.

    Four loading paths are supported, chosen in this order:
      1. LoRA checkpoints (``'lora'`` in ``model_name``): load the base
         model, the non-LoRA trainables and merge the LoRA weights.
      2. Base/pretrain checkpoints (``model_base`` given, or the config has
         a truthy ``tune_mm_mlp_adapter``): load the base model plus the
         multimodal projector weights.
      3. SFT checkpoints (``'videollama2'`` in the config model type).
      4. Anything else is loaded through ``AutoModelForCausalLM``.

    Args:
        model_path: Local directory or hub id of the checkpoint.
        model_base: Optional base model path/id. For paths 1 and 2 it falls
            back to ``_name_or_path`` of the checkpoint config.
        model_name: Name used only to detect LoRA checkpoints.
        load_8bit: Load with 8-bit quantization.
        load_4bit: Load with 4-bit NF4 quantization (ignored if ``load_8bit``).
        device_map: Device map forwarded to ``from_pretrained``.
        device: Any value other than ``"cuda"`` pins the whole model to it.
        use_flash_attn: Request the ``flash_attention_2`` implementation.
        **kwargs: Forwarded to ``from_pretrained``. A ``token`` entry is also
            used for config, tokenizer and hub downloads.

    Returns:
        Tuple ``(tokenizer, model, processor, context_len)``. ``processor``
        is ``None`` unless the model type contains ``"videollama"``.
    """
    # `token` deliberately stays in kwargs so it is also forwarded to the
    # model's from_pretrained() call, as before.
    token = kwargs.get('token')

    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        # NOTE: High-version Transformers will report: """ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time."""
        # kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    # Pass the token here as well so private/gated repos can be resolved.
    config = AutoConfig.from_pretrained(model_path, token=token)

    # judge model type
    model_type = config.model_type

    # judge pretrain/finetune (attribute is absent on non-pretraining configs)
    is_pretraining = getattr(config, 'tune_mm_mlp_adapter', False)

    # NOTE: lora/qlora model loading ('qlora' contains 'lora', one test suffices)
    if 'lora' in model_name.lower():
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        # NOTE: remove qlora training quantization config
        if hasattr(config, 'quantization_config'):
            del config.quantization_config
        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
        print('Loading VideoLLaMA lora model...')

        if 'vicuna' in model_base.lower():
            model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
        elif 'mistral' in model_base.lower():
            model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
        else:
            # Using the Mistral class here makes a finetuned qwen2-based
            # variant produce random output, so Qwen2 is the fallback.
            model = Videollama2Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
        if model.lm_head.weight.shape[0] != token_num:
            model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
            model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

        print('Loading additional VideoLLaMA weights...')
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from sources you trust.
        local_trainables = os.path.join(model_path, 'non_lora_trainables.bin')
        if os.path.exists(local_trainables):
            non_lora_trainables = torch.load(local_trainables, map_location='cpu')
        else:
            # this is probably from HF Hub
            from huggingface_hub import hf_hub_download
            cache_file = hf_hub_download(
                repo_id=model_path,
                filename='non_lora_trainables.bin',
                subfolder=None,
                token=token)
            non_lora_trainables = torch.load(cache_file, map_location='cpu')
        # Strip the 'base_model.' (11 chars) and, if present, the extra
        # 'model.' (6 chars) prefixes from the parameter names.
        non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
        if any(k.startswith('model.model.') for k in non_lora_trainables):
            non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
        model.load_state_dict(non_lora_trainables, strict=False)

        from peft import PeftModel
        print('Loading LoRA weights...')
        model = PeftModel.from_pretrained(model, model_path)
        print('Merging LoRA weights...')
        model = model.merge_and_unload()
        print('Model is loaded...')
    elif model_base is not None or is_pretraining:
        # NOTE: Base/Pretrain model loading
        print('Loading VideoLLaMA 2 from base model...')
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)

        model_cls = _select_videollama2_class(model_type)
        model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        # NOTE: loading vision-language projector; supports loading
        # mm_projector.bin both offline and online
        mm_projector_weights = load_mm_projector(model_path, token=token)
        model.load_state_dict(mm_projector_weights, strict=False)
    elif 'videollama2' in model_type:
        # NOTE: SFT model loading
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)

        model_cls = _select_videollama2_class(model_type)
        model = model_cls.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token)
        model = AutoModelForCausalLM.from_pretrained(model_path, config=config, **kwargs)

    processor = None

    if "videollama" in model_type:
        vision_tower = model.get_vision_tower()
        # NOTE: videollama2 adopts the same processor for processing image and video.
        processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, processor, context_len
videollama2/model/encoder.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from transformers import (
7
+ CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
8
+ SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig,
9
+ )
10
+
11
+
12
class CLIPVisionTower(nn.Module):
    """CLIP vision encoder wrapper returning hidden states of a chosen layer.

    Args:
        vision_tower: Name or path of the CLIP checkpoint.
        args: Config object providing ``mm_vision_select_layer`` and,
            optionally, ``mm_vision_select_feature`` (default ``'patch'``).
        load_pretrained: Load the pretrained weights instead of building the
            model from its config only.
    """

    def __init__(self, vision_tower, args, load_pretrained=False):
        super().__init__()

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)

        config = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
        config._attn_implementation = "flash_attention_2"

        if load_pretrained:
            self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        else:
            self.vision_tower = CLIPVisionModel(config=config)

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and token subset."""
        hidden = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # Drop the first token along the sequence axis.
            return hidden[:, 1:]
        if self.select_feature == 'cls_patch':
            return hidden
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        if type(images) is not list:
            batch_out = self.vision_tower(images, output_hidden_states=True)
            return self.feature_select(batch_out).to(images.dtype)

        # A list is encoded one image at a time; a list of features is returned.
        features = []
        for single in images:
            single_out = self.vision_tower(single.unsqueeze(0), output_hidden_states=True)
            features.append(self.feature_select(single_out).to(single.dtype))
        return features

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        return self.vision_tower.config

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        per_side = self.config.image_size // self.config.patch_size
        return per_side ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
82
+
83
+
84
class SiglipVisionTower(nn.Module):
    """SigLIP vision encoder wrapper returning hidden states of a chosen layer.

    Args:
        vision_tower: Name or path of the SigLIP checkpoint.
        args: Config object providing ``mm_vision_select_layer`` and,
            optionally, ``mm_vision_select_feature`` (default ``'patch'``).
        load_pretrained: Load the pretrained weights instead of building the
            model from its config only.
    """

    def __init__(self, vision_tower, args, load_pretrained=False):
        super().__init__()

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)

        config = SiglipVisionConfig.from_pretrained(self.vision_tower_name)
        config._attn_implementation = 'flash_attention_2'

        if load_pretrained:
            self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
        else:
            self.vision_tower = SiglipVisionModel(config=config)

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer; only ``'patch'`` is supported."""
        hidden = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature != 'patch':
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        # All tokens of the selected layer are kept as-is.
        return hidden

    @torch.no_grad()
    def forward(self, images):
        if type(images) is not list:
            batch_out = self.vision_tower(images, output_hidden_states=True)
            return self.feature_select(batch_out).to(images.dtype)

        # A list is encoded one image at a time; a list of features is returned.
        features = []
        for single in images:
            single_out = self.vision_tower(single.unsqueeze(0), output_hidden_states=True)
            features.append(self.feature_select(single_out).to(single.dtype))
        return features

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        return self.vision_tower.config

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        per_side = self.config.image_size // self.config.patch_size
        return per_side ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
152
+
153
+
154
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision tower named by the config.

    The tower name is read from ``vision_tower_cfg.mm_vision_tower``, falling
    back to ``vision_tower_cfg.vision_tower``. Names containing ``'clip'``
    build a ``CLIPVisionTower``; names containing ``'siglip'`` build a
    ``SiglipVisionTower``.

    Args:
        vision_tower_cfg: Config object carrying the tower name and the
            options read by the tower classes.
        **kwargs: Forwarded to the tower constructor.

    Raises:
        ValueError: If no tower name is configured or the name is unknown.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))

    # Without this check a missing name failed with the opaque
    # "argument of type 'NoneType' is not iterable".
    if vision_tower is None:
        raise ValueError('No vision tower configured: expected `mm_vision_tower` or `vision_tower` on the config.')

    if 'clip' in vision_tower:
        vision_tower = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif 'siglip' in vision_tower:
        vision_tower = SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    else:
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    return vision_tower
videollama2/model/projector.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Alibaba DAMO Academy
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import einops
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from timm.models.regnet import RegStage
23
+ from timm.models.layers import LayerNorm, LayerNorm2d
24
+ from transformers import TRANSFORMERS_CACHE
25
+
26
+
27
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
    """Compute the local snapshot folder of a cached hub repository.

    The path is ``<cache_dir>/<repo_type>s--<org>--<name>/snapshots/<revision>``.
    ``<revision>`` is the content of ``refs/main`` inside the repo cache when
    that file exists, otherwise the literal string ``"main"``.

    Args:
        repo_id: Repository id; every ``/`` is replaced by ``--``.
        cache_dir: Cache root. Defaults to ``TRANSFORMERS_CACHE``.
        repo_type: Repository type used as the folder-name prefix.

    Returns:
        The snapshot folder path. It is not checked for existence.
    """
    revision = "main"
    # 1. parse the downloaded cache folder
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    # 2. resolve refs (for instance to convert main to the associated commit sha)
    revision_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            # Strip so a trailing newline in the ref file cannot end up
            # inside the snapshot path.
            revision = f.read().strip()
    # 3. acquire the snapshot folder
    return os.path.join(repo_cache, "snapshots", revision)
47
+
48
+
49
def load_mm_projector(model_path, cache_dir=None, token=None):
    """Load ``mm_projector.bin`` from a local folder or a hub repository.

    Lookup order:
      1. ``<model_path>/mm_projector.bin`` on the local filesystem;
      2. the cached snapshot folder computed by ``parse_snapshot_folder``;
      3. a fresh ``snapshot_download`` of ``model_path``.

    Args:
        model_path: Local directory or hub repository id.
        cache_dir: Optional cache root used for the snapshot lookup/download.
        token: Optional hub token forwarded to ``snapshot_download``.

    Returns:
        dict mapping each stored key to its value converted to
        ``torch.float16``.
    """
    weights_name = 'mm_projector.bin'
    if os.path.exists(os.path.join(model_path, weights_name)):
        folder = model_path
    else:
        folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
        if not os.path.exists(os.path.join(folder, weights_name)):
            # downloading from remote repo
            from huggingface_hub import snapshot_download
            # Use the folder the download actually produced: the path guessed
            # above was computed before any ref file existed and may point to
            # a non-existent ``snapshots/main`` directory.
            folder = snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)

    # NOTE(security): ``torch.load`` unpickles the file; only load projector
    # weights from sources you trust.
    mm_projector_weights = torch.load(os.path.join(folder, weights_name), map_location='cpu')
    return {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
64
+
65
+
66
class IdentityMap(nn.Module):
    """Projector that returns its input untouched."""

    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        """Return ``x``; any additional arguments are ignored."""
        return x

    @property
    def config(self):
        """Dictionary describing this projector type."""
        return dict(mm_projector_type='identity')
77
+
78
+
79
class SimpleResBlock(nn.Module):
    """Pre-normalised residual MLP block that keeps the channel count."""

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        layers = [
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        ]
        self.proj = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``norm(x) + proj(norm(x))``.

        The skip connection carries the normalised input, not the raw one.
        """
        normed = self.pre_norm(x)
        residual = self.proj(normed)
        return normed + residual
93
+
94
+
95
def build_vision_projector(config, delay_load=False, **kwargs):
    """Create the vision-to-language projector selected by ``config``.

    Args:
        config: Object providing ``mm_hidden_size`` and ``hidden_size``; the
            projector kind is read from ``mm_projector_type`` and defaults
            to ``'linear'``.
        delay_load: Accepted but not used by this function.
        **kwargs: Accepted but not used by this function.

    Returns:
        The projector module for the requested type.

    Raises:
        ValueError: If the projector type is not recognised.
    """
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    # "mlp<N>x_gelu": N linear layers with a GELU between consecutive ones.
    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(mlp_depth - 1):
            layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
        return nn.Sequential(*layers)

    if projector_type == "linear":
        # NOTE: for both linear and mlp2x_gelu projector type, mean pooling is adopted to aggregate video features
        return nn.Linear(config.mm_hidden_size, config.hidden_size)
    if projector_type == "stc_connector":
        return STCConnector(config)
    if projector_type == "stp_connector":
        return STPConnector(config)
    if projector_type == "stc_connector_v35":
        return STCConnectorV35(config)
    if projector_type == "spatial_conv":
        return SpatialConv(config)
    if projector_type == "spatial_pool":
        return SpatialPool(config)
    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
123
+
124
+
125
def build_mlp(depth, hidden_size, output_hidden_size):
    """Build an MLP of ``depth`` linear layers separated by GELU.

    The first layer maps ``hidden_size`` to ``output_hidden_size``; every
    following layer keeps ``output_hidden_size``. A ``depth`` of 1 or less
    yields the single input layer.
    """
    layers = [nn.Linear(hidden_size, output_hidden_size)]
    for _ in range(depth - 1):
        layers.extend((nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)))
    return nn.Sequential(*layers)
131
+
132
+
133
class STCConnector(nn.Module):
    """Convolutional vision-language connector over time and space.

    ``forward`` applies, in order: a per-frame ``RegStage`` (``s1``), a
    strided ``Conv3d`` followed by ``SiLU`` (``sampler``), a second per-frame
    ``RegStage`` (``s2``) and an MLP (``readout``). When ``depth == 0`` both
    stages are ``nn.Identity``.
    """

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Temporal Convolutional Vision-Language Connector.

        Args:
            config: config object.
            downsample: (temporal, height, width) downsample rate.
            depth: depth of the spatial interaction blocks.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__()
        self.encoder_hidden_size = encoder_hidden_size = config.mm_hidden_size
        self.hidden_size = hidden_size = config.hidden_size
        self.output_hidden_size = output_hidden_size = config.hidden_size
        # TODO: make these as config arguments
        self.depth = depth
        self.mlp_depth = mlp_depth
        self.downsample = downsample
        if depth != 0:
            self.s1 = RegStage(
                depth=depth,
                in_chs=encoder_hidden_size,
                out_chs=hidden_size,
                stride=1,
                dilation=1,
                act_layer=nn.SiLU,
                norm_layer=LayerNorm2d,
            )
        else:
            self.s1 = nn.Identity()
        # With depth == 0 ``s1`` is an identity, so the downsampler still sees
        # the raw encoder channels; declaring ``hidden_size`` input channels
        # would fail whenever the two sizes differ.
        sampler_in_chs = hidden_size if depth != 0 else encoder_hidden_size
        self.sampler = nn.Sequential(
            nn.Conv3d(
                in_channels=sampler_in_chs,
                out_channels=hidden_size,
                kernel_size=downsample,
                stride=downsample,
                # padding of 1 is applied on the temporal and both spatial axes
                padding=1,
                bias=True
            ),
            nn.SiLU()
        )
        if depth != 0:
            self.s2 = RegStage(
                depth=depth,
                in_chs=hidden_size,
                out_chs=hidden_size,
                stride=1,
                dilation=1,
                act_layer=nn.SiLU,
                norm_layer=LayerNorm2d,
            )
        else:
            self.s2 = nn.Identity()
        self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size)

    def forward(self, x):
        """Aggregate tokens on the temporal and spatial dimensions.

        Args:
            x: input tokens [b, t, h, w, d] / [b, t, l, d]. For the 4-D form
                ``l`` is split into an ``h x w`` grid with
                ``h = w = int(l ** 0.5)``.
        Returns:
            aggregated tokens [b, l, d]
        """
        t = x.size(1)
        if x.ndim == 4:
            hw = int(x.size(2) ** 0.5)
            x = einops.rearrange(x, "b t (h w) d -> b d t h w", h=hw, w=hw)
        elif x.ndim == 5:
            x = einops.rearrange(x, "b t h w d -> b d t h w")

        # 1. the first stage of the adapter (frames folded into the batch)
        x = einops.rearrange(x, "b d t h w -> (b t) d h w")
        x = self.s1(x)
        x = einops.rearrange(x, "(b t) d h w -> b d t h w", t=t)
        # 2. downsampler
        x = self.sampler(x)
        new_t = x.size(2)
        # 3. the second stage of the adapter
        x = einops.rearrange(x, "b d t h w -> (b t) d h w")
        x = self.s2(x)
        # flatten time and space into one token axis before the MLP
        x = einops.rearrange(x, "(b t) d h w -> b (t h w) d", t=new_t)
        return self.readout(x)
216
+
217
+
218
class STPConnector(STCConnector):
    """``STCConnector`` variant whose downsampler is ``AvgPool3d`` + ``SiLU``."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler with pooling.

        Args:
            config: config object.
            downsample: (temporal, height, width) pooling window.
            depth: depth of the spatial interaction blocks.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        self.sampler = nn.Sequential(nn.AvgPool3d(downsample), nn.SiLU())
        # Pooling keeps the channel count. With depth == 0 both stages are
        # identities, so the readout receives encoder-sized features and has
        # to be rebuilt when that size differs from ``hidden_size``.
        if depth == 0 and self.encoder_hidden_size != self.hidden_size:
            self.readout = build_mlp(mlp_depth, self.encoder_hidden_size, self.output_hidden_size)
223
+
224
+
225
class STCConnectorV35(STCConnector):
    """``STCConnector`` variant whose ``Conv3d`` downsampler uses no padding."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler.

        Args:
            config: config object.
            downsample: (temporal, height, width) kernel size and stride.
            depth: depth of the spatial interaction blocks.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        # With depth == 0 the first stage is an identity, so the convolution
        # receives encoder-sized channels rather than ``hidden_size``.
        sampler_in_chs = self.hidden_size if depth != 0 else self.encoder_hidden_size
        self.sampler = nn.Sequential(
            nn.Conv3d(
                in_channels=sampler_in_chs,
                out_channels=self.hidden_size,
                kernel_size=downsample,
                stride=downsample,
                padding=0,
                bias=True
            ),
            nn.SiLU(),
        )
239
+
240
+
241
class SpatialConv(STCConnector):
    """``STCConnector`` preset: ``downsample=(1, 2, 2)`` and ``depth=0``."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        options = dict(downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(config=config, **options)
245
+
246
+
247
class SpatialPool(STPConnector):
    """``STPConnector`` preset: ``downsample=(1, 2, 2)`` and ``depth=0``."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        options = dict(downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(config=config, **options)
videollama2/model/videollama2_arch.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+
19
+ import einops
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .projector import load_mm_projector, build_vision_projector
24
+ from .encoder import build_vision_tower
25
+ from ..constants import IGNORE_INDEX, NUM_FRAMES, MODAL_INDEX_MAP
26
+
27
+
28
class Videollama2MetaModel:
    """Mixin that equips a model with a vision tower and a multimodal projector.

    ``__init__`` forwards ``config`` to the next class in the MRO, so this
    class is intended to be combined with a backbone model class.
    """

    def __init__(self, config):
        super(Videollama2MetaModel, self).__init__(config)

        # Vision modules are only built eagerly when the config already names
        # a tower; otherwise ``initialize_vision_modules`` creates them.
        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        """Return the vision tower, unwrapping the single-element list form.

        Returns ``None`` when no ``vision_tower`` attribute has been set.
        """
        vision_tower = getattr(self, 'vision_tower', None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    @staticmethod
    def _select_projector_weights(weights, keyword):
        """Keep entries whose key contains ``keyword + '.'`` and strip that prefix.

        Only the first occurrence is used as the split point, so the part
        after it is preserved in full.
        """
        marker = keyword + '.'
        return {k.split(marker, 1)[1]: v for k, v in weights.items() if marker in k}

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Create or reuse the vision tower and projector and update the config.

        Args:
            model_args: Object providing ``vision_tower``,
                ``mm_vision_select_layer``, ``mm_vision_select_feature``,
                ``pretrain_mm_mlp_adapter`` and optionally
                ``mm_projector_type`` (default ``'linear'``).
            fsdp: When non-empty, a newly built tower is stored wrapped in a
                one-element list instead of directly.
        """
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args, load_pretrained=True)

            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            # Handles both the list-wrapped and the direct storage form.
            vision_tower = self.get_vision_tower()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is None:
            return

        if os.path.exists(pretrain_mm_mlp_adapter):
            if os.path.isdir(pretrain_mm_mlp_adapter):
                mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)
            else:
                # NOTE(security): ``torch.load`` unpickles the file; only use
                # trusted checkpoints.
                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
        else:
            # Support loading projector weights from remote HuggingFace model hub
            pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.replace('mm_projector.bin', '')
            pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.strip('/').strip('\\').strip()
            mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)

        # set strict=False to avoid missing key error regarding bert.embeddings.position_ids
        self.mm_projector.load_state_dict(
            self._select_projector_weights(mm_projector_weights, 'mm_projector'), strict=False
        )
97
+
98
+
99
class Videollama2MetaForCausalLM(ABC):
    """Mixin that merges visual features into the token embeddings of a causal LM.

    Subclasses implement ``get_model`` and are expected to provide
    ``self.config`` and ``self.device``.
    """

    @abstractmethod
    def get_model(self):
        """Return the inner model that owns the vision tower and projector."""
        pass

    def num_frames(self):
        """Return ``config.num_frames`` when present, else ``NUM_FRAMES``."""
        return self.config.num_frames if hasattr(self.config, 'num_frames') else NUM_FRAMES

    def get_vision_tower(self):
        """Return the vision tower of the inner model."""
        return self.get_model().get_vision_tower()

    def encode_images_or_videos(self, images):
        """Encode a batch of ``(data, modal)`` pairs into projected features.

        Args:
            images: Iterable of ``(data, modal)`` pairs. When ``modal`` is
                ``'image'`` the data is expanded to ``num_frames`` along its
                first dimension; other modalities are used unchanged.
                # assumes every entry ends up as (t, c, h, w) - TODO confirm

        Returns:
            The result of ``temporal_aggregator`` on the per-frame features.
        """
        num_frames = self.num_frames()

        data_batch = []
        for data, modal in images:
            if modal == 'image':
                data = data.expand(num_frames, -1, -1, -1)
            data_batch.append(data)

        data_batch = torch.stack(data_batch, dim=0)

        assert len(data_batch.size()) == 5
        batch_size = data_batch.size(0)

        # Fold frames into the batch axis for the vision tower, then unfold.
        frames = einops.rearrange(data_batch, 'b t c h w -> (b t) c h w')
        frames_features = self.get_model().get_vision_tower()(frames)
        frames_features = einops.rearrange(frames_features, '(b t) n h -> b t n h', b=batch_size)

        return self.temporal_aggregator(frames_features)

    def temporal_aggregator(self, frames_features):
        """Temporal aggregation of frame features.

        Args:
            frames_features (torch.Tensor): Frame features with shape (b, t, n, h).
        Returns:
            torch.Tensor: Video features with shape (b, n, h).
        Raises:
            Exception: If the configured projector type is not handled.
        """
        # TODO: improve the merging method.
        projector_type = self.config.mm_projector_type
        # *********** mean pooling *************
        if projector_type in ("mlp2x_gelu", "linear"):
            return self.get_model().mm_projector(frames_features.mean(1))
        # *********** spatial convolution / spatial pooling / time ************
        # These projectors receive the per-frame features unchanged.
        if (
            projector_type in ("spatial_conv", "spatial_pool")
            or "tc_connector" in projector_type
            or "tp_connector" in projector_type
        ):
            return self.get_model().mm_projector(frames_features)
        raise Exception(f"Unsupported projector type {projector_type}!!!")

    @staticmethod
    def _find_mm_token_positions(token_ids):
        """Return the indices in ``token_ids`` that hold a ``MODAL_INDEX_MAP`` value."""
        is_mm_token = torch.zeros_like(token_ids, dtype=torch.bool)
        for mm_token_idx in MODAL_INDEX_MAP.values():
            is_mm_token |= token_ids == mm_token_idx
        return torch.where(is_mm_token)[0]

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images
    ):
        """Replace multimodal placeholder tokens with projected visual features.

        Args:
            input_ids: Token ids, indexed as ``(batch, seq)``.
            attention_mask: Optional mask aligned with ``input_ids``.
            past_key_values: Passed through unchanged.
            labels: Optional labels with the same shape as ``input_ids``.
            images: Batch of ``(data, modal)`` pairs, or ``None``.

        Returns:
            ``(input_ids, attention_mask, past_key_values, inputs_embeds,
            labels)``. On the multimodal path ``input_ids`` is ``None`` and
            ``inputs_embeds`` holds the merged, right-padded embeddings; on
            the text-only path the inputs are returned unchanged with
            ``inputs_embeds`` set to ``None``.
        """
        vision_tower = self.get_vision_tower()
        # NOTE: text-only situation
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            return input_ids, attention_mask, past_key_values, None, labels

        mm_features = self.encode_images_or_videos(images)
        embed_tokens = self.get_model().embed_tokens

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_mm_idx = 0
        # replace image/video/audio tokens with pre-computed embeddings
        for batch_idx, cur_input_ids in enumerate(input_ids):
            mm_token_indices = self._find_mm_token_positions(cur_input_ids)
            # pure text input
            if mm_token_indices.numel() == 0:
                half_len = cur_input_ids.shape[0] // 2
                cur_mm_features = mm_features[cur_mm_idx]
                cur_input_embeds_1 = embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = embed_tokens(cur_input_ids[half_len:])
                # The zero-length slice adds no tokens to the sequence.
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_mm_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                # A text-only sample still consumes one feature slot.
                cur_mm_idx += 1
                continue

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape

            while mm_token_indices.numel() > 0:
                cur_mm_features = mm_features[cur_mm_idx]
                mm_token_start = mm_token_indices[0]

                cur_new_input_embeds.append(embed_tokens(cur_input_ids[:mm_token_start]))
                cur_new_input_embeds.append(cur_mm_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:mm_token_start])
                    # Visual positions never contribute to the loss.
                    cur_new_labels.append(torch.full((cur_mm_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[mm_token_start + 1:]

                cur_mm_idx += 1
                # Drop everything up to and including the placeholder token.
                cur_input_ids = cur_input_ids[mm_token_start + 1:]
                mm_token_indices = self._find_mm_token_positions(cur_input_ids)

            if cur_input_ids.numel() > 0:
                cur_new_input_embeds.append(embed_tokens(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            # NOTE: one cur_new_input_embeds per each
            new_input_embeds.append(torch.cat(cur_new_input_embeds, dim=0))
            if labels is not None:
                new_labels.append(torch.cat(cur_new_labels, dim=0))

        # padding
        seq_lens = [x.shape[0] for x in new_input_embeds]
        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(seq_lens)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                pad = torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                new_input_embeds_align.append(torch.cat((cur_new_embed, pad), dim=0))
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                for cur_new_label in new_labels:
                    pad = torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)
                    new_labels_align.append(torch.cat((cur_new_label, pad), dim=0))
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                # Pad sizes come from the embedding lengths rather than from
                # the labels, so this also works when ``labels`` is None.
                new_attention_mask = []
                for cur_attention_mask, cur_len in zip(attention_mask, seq_lens):
                    new_attn_mask_pad_left = torch.full((cur_len - input_ids.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attention_mask.append(torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0))
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_input_embeds.shape[:2]
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
videollama2/model/videollama2_llama.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2LlamaConfig(LlamaConfig):
    """``LlamaConfig`` registered under the ``videollama2_llama`` model type."""

    model_type = "videollama2_llama"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set again on the instance after the parent constructor has run.
        self.model_type = "videollama2_llama"
36
+
37
+
38
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
    """``LlamaModel`` extended with the Videollama2 vision mixin."""

    config_class = Videollama2LlamaConfig

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
43
+
44
+
45
class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
    """Llama causal LM that accepts ``images`` alongside the token inputs."""

    config_class = Videollama2LlamaConfig

    def __init__(self, config, **kwargs):
        # ``super(LlamaForCausalLM, self)`` starts the constructor chain
        # *after* LlamaForCausalLM, so its own ``__init__`` is skipped and the
        # sub-modules are created here instead.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = Videollama2LlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the inner ``Videollama2LlamaModel``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Merge visual features into the inputs, then run the parent forward.

        When ``inputs_embeds`` is not supplied, the inputs are first passed
        through ``prepare_inputs_labels_for_multimodal``. ``position_ids``
        and ``**kwargs`` are accepted but not forwarded to the parent.

        Returns:
            The parent's output. The (possibly expanded) labels are attached
            as ``outputs.labels`` unless the parent returned a plain tuple.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # A tuple output (``return_dict=False``) cannot take new attributes;
        # assigning to it would raise AttributeError.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids and optional visual inputs.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and the merged embeddings are used.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's prepared inputs with ``images`` when provided."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
152
+
153
+
154
+ AutoConfig.register("videollama2_llama", Videollama2LlamaConfig)
155
+ AutoModelForCausalLM.register(Videollama2LlamaConfig, Videollama2LlamaForCausalLM)
videollama2/model/videollama2_mistral.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
24
+ MistralConfig, MistralModel, MistralForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2MistralConfig(MistralConfig):
    """``MistralConfig`` registered under the ``videollama2_mistral`` model type."""

    model_type = "videollama2_mistral"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set again on the instance after the parent constructor has run.
        self.model_type = "videollama2_mistral"
38
+
39
+
40
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
    """``MistralModel`` extended with the Videollama2 vision mixin."""

    config_class = Videollama2MistralConfig

    def __init__(self, config: MistralConfig):
        super().__init__(config)
45
+
46
+
47
class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausalLM):
    """Mistral causal LM that accepts ``images`` alongside the token inputs."""

    config_class = Videollama2MistralConfig

    def __init__(self, config, **kwargs):
        # ``super(MistralForCausalLM, self)`` starts the constructor chain
        # *after* MistralForCausalLM, so its own ``__init__`` is skipped and
        # the sub-modules are created here instead.
        super(MistralForCausalLM, self).__init__(config)
        self.model = Videollama2MistralModel(config)
        # NOTE: unlike the Llama variant, ``pretraining_tp`` is not copied
        # from the config here.
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the inner ``Videollama2MistralModel``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Merge visual features into the inputs, then run the parent forward.

        When ``inputs_embeds`` is not supplied, the inputs are first passed
        through ``prepare_inputs_labels_for_multimodal``. ``position_ids``
        and ``**kwargs`` are accepted but not forwarded to the parent.

        Returns:
            The parent's output. The (possibly expanded) labels are attached
            as ``outputs.labels`` unless the parent returned a plain tuple.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # A tuple output (``return_dict=False``) cannot take new attributes;
        # assigning to it would raise AttributeError.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids and optional visual inputs.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and the merged embeddings are used.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's prepared inputs with ``images`` when provided."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
154
+
155
+
156
+ AutoConfig.register("videollama2_mistral", Videollama2MistralConfig)
157
+ AutoModelForCausalLM.register(Videollama2MistralConfig, Videollama2MistralForCausalLM)
videollama2/model/videollama2_mixtral.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MixtralConfig, MixtralModel, MixtralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
29
+
30
+
31
class Videollama2MixtralConfig(MixtralConfig):
    """Mixtral configuration identified by the ``videollama2_mixtral`` model type."""

    model_type = "videollama2_mixtral"

    def __init__(self, **kwargs):
        super(Videollama2MixtralConfig, self).__init__(**kwargs)
        # Set on the instance as well, after the parent constructor has run.
        self.model_type = "videollama2_mixtral"
37
+
38
+
39
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
    """Mixtral backbone combined with the VideoLLaMA2 meta-model mixin."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config: MixtralConfig):
        # Cooperative initialisation across both base classes via the MRO.
        super().__init__(config)
44
+
45
+
46
class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausalLM):
    """Mixtral causal language model that accepts an extra ``images`` input.

    When no ``inputs_embeds`` are supplied, ``forward`` asks
    ``prepare_inputs_labels_for_multimodal`` (from the VideoLLaMA2 mixin) to
    build them from the token ids and images before delegating to the parent
    ``forward``.
    """

    config_class = Videollama2MixtralConfig

    def __init__(self, config, **kwargs):
        # Deliberately bypass MixtralForCausalLM.__init__ so that ``self.model``
        # can be the multimodal Videollama2MixtralModel.
        super(MixtralForCausalLM, self).__init__(config)
        self.model = Videollama2MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Because the parent constructor is bypassed, the router settings it
        # would normally store have to be set here; the inherited forward pass
        # reads them when router logits / the auxiliary loss are requested.
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``Videollama2MixtralModel``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal embeddings if needed.

        ``position_ids`` and any extra ``kwargs`` are accepted but are not
        forwarded to the parent ``forward``.
        NOTE(review): presumably intentional because the multimodal
        preparation step changes the sequence length -- confirm.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is present in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the attention mask and embeddings are needed afterwards.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was given."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
149
+
150
+
151
+ AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
152
+ AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
videollama2/model/videollama2_qwen2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ Qwen2Config, Qwen2Model, Qwen2ForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2Qwen2Config(Qwen2Config):
    """Qwen2 configuration identified by the ``videollama2_qwen2`` model type."""

    model_type = "videollama2_qwen2"

    def __init__(self, **kwargs):
        super(Videollama2Qwen2Config, self).__init__(**kwargs)
        # Set on the instance as well, after the parent constructor has run.
        self.model_type = "videollama2_qwen2"
36
+
37
+
38
class Videollama2Qwen2Model(Videollama2MetaModel, Qwen2Model):
    """Qwen2 backbone combined with the VideoLLaMA2 meta-model mixin."""

    config_class = Videollama2Qwen2Config

    def __init__(self, config: Videollama2Qwen2Config):
        # Cooperative initialisation across both base classes via the MRO.
        super().__init__(config)
43
+
44
+
45
class Videollama2Qwen2ForCausalLM(Qwen2ForCausalLM, Videollama2MetaForCausalLM):
    """Qwen2 causal language model that accepts an extra ``images`` input.

    When no ``inputs_embeds`` are supplied, ``forward`` asks
    ``prepare_inputs_labels_for_multimodal`` (from the VideoLLaMA2 mixin) to
    build them from the token ids and images before delegating to the parent
    ``forward``.
    """

    config_class = Videollama2Qwen2Config

    def __init__(self, config, **kwargs):
        # Deliberately bypass Qwen2ForCausalLM.__init__ so that ``self.model``
        # can be the multimodal Videollama2Qwen2Model.
        super(Qwen2ForCausalLM, self).__init__(config)
        self.model = Videollama2Qwen2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``Videollama2Qwen2Model``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal embeddings if needed.

        ``position_ids`` and any extra ``kwargs`` are accepted but are not
        forwarded to the parent ``forward``.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        parent_kwargs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return super().forward(**parent_kwargs)

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is present in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            # Text-only path: plain token embeddings.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the attention mask and embeddings are needed afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images,
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was given."""
        images = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        if images is None:
            return model_inputs
        model_inputs['images'] = images
        return model_inputs
148
+
149
+
150
+ AutoConfig.register("videollama2_qwen2", Videollama2Qwen2Config)
151
+ AutoModelForCausalLM.register(Videollama2Qwen2Config, Videollama2Qwen2ForCausalLM)
videollama2/serve/cli.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, NUM_FRAMES
5
+ from videollama2.conversation import conv_templates, SeparatorStyle
6
+ from videollama2.model.builder import load_pretrained_model
7
+ from videollama2.utils import disable_torch_init
8
+ from videollama2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, tokenizer_MMODAL_token
9
+
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+
13
+ import requests
14
+ from io import BytesIO
15
+ from transformers import TextStreamer
16
+
17
+
18
def load_image(image_file, timeout=30):
    """Load an image from a local path or an HTTP(S) URL as an RGB PIL image.

    Args:
        image_file: Local file path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the CLI forever.

    Returns:
        The image converted to RGB mode.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an error status.
    """
    if image_file.startswith(('http://', 'https://')):
        response = requests.get(image_file, timeout=timeout)
        # Fail with the HTTP status instead of handing an error page to PIL.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert('RGB')
    return Image.open(image_file).convert('RGB')
25
+
26
def load_video(video_file):
    """Read ``NUM_FRAMES`` evenly spaced frames from a video file.

    Args:
        video_file: Path or URI handed to decord's ``VideoReader``.

    Returns:
        The batch returned by ``VideoReader.get_batch`` for the sampled
        frame indices.
    """
    # numpy is not imported at module level in this file, so import it here;
    # without this the function failed with NameError on ``np``.
    import numpy as np

    decord_vr = VideoReader(uri=video_file, ctx=cpu(0))
    duration = len(decord_vr)
    # Indices spread uniformly from the first to the last frame (inclusive).
    frame_id_list = np.linspace(0, duration - 1, NUM_FRAMES, dtype=int)
    return decord_vr.get_batch(frame_id_list)
32
+
33
def load_image_or_video(image_or_video_file):
    """Dispatch to ``load_image`` or ``load_video`` based on the file extension.

    The extension check is case-insensitive.

    Args:
        image_or_video_file: Path (or URL) of the media file.

    Returns:
        Whatever ``load_image`` or ``load_video`` returns for the file.

    Raises:
        Exception: if the extension is not a supported image or video type.
    """
    # The original tested an undefined name (``file_path``) and always failed
    # with NameError; test the actual argument instead.
    lowered = image_or_video_file.lower()
    if lowered.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return load_image(image_file=image_or_video_file)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return load_video(video_file=image_or_video_file)
    raise Exception(f"File type of {image_or_video_file} not supported!!!")
40
+
41
+
42
def main(args):
    """Run an interactive single-image chat session on the command line.

    Loads the model named by ``args.model_path``, preprocesses
    ``args.image_file`` once, then repeatedly reads a user message from
    stdin, streams the model's reply to stdout and appends it to the
    conversation.  An empty line or EOF ends the session.

    Args:
        args: Parsed command-line namespace (model path/base, image file,
            device, conversation mode, sampling and quantisation options,
            debug flag).
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, _ = load_pretrained_model(
        args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)

    # The conversation mode is not inferred from the model name; it is fixed.
    conv_mode = "llava_v1"  # fix conversation mode for now

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    roles = conv.roles

    image = load_image(args.image_file)
    image_size = image.size
    # Similar operation in model_worker.py
    image_tensor = process_images([image], image_processor, model.config)
    if isinstance(image_tensor, list):
        image_tensor = [tensor.to(model.device, dtype=torch.float16) for tensor in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # first message: prepend the image placeholder token(s)
            if model.config.mm_use_im_start_end:
                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
            else:
                inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            conv.append_message(conv.roles[0], inp)
            # Later turns must not insert the image placeholder again.
            image = None
        else:
            # later messages
            conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        with torch.inference_mode():
            # NOTE(review): `image_sizes` is forwarded through **kwargs to the
            # parent generate(); confirm the installed transformers version
            # accepts it as a model kwarg.
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                image_sizes=[image_size],
                do_sample=args.temperature > 0,
                temperature=args.temperature,
                max_new_tokens=args.max_new_tokens,
                streamer=streamer,
                use_cache=True)

        # NOTE(review): this assumes output_ids starts with the prompt tokens;
        # verify, since generation here is driven by inputs_embeds.
        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
124
+
125
+
126
+ if __name__ == "__main__":
127
+ parser = argparse.ArgumentParser()
128
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
129
+ parser.add_argument("--model-base", type=str, default=None)
130
+ parser.add_argument("--image-file", type=str, required=True)
131
+ parser.add_argument("--device", type=str, default="cuda")
132
+ parser.add_argument("--conv-mode", type=str, default=None)
133
+ parser.add_argument("--temperature", type=float, default=0.2)
134
+ parser.add_argument("--max-new-tokens", type=int, default=512)
135
+ parser.add_argument("--load-8bit", action="store_true")
136
+ parser.add_argument("--load-4bit", action="store_true")
137
+ parser.add_argument("--debug", action="store_true")
138
+ args = parser.parse_args()
139
+ main(args)
videollama2/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from videollama2.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from videollama2.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategies the controller can use to choose a worker."""

    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Map a command-line name to its ``DispatchMethod`` member.

        Args:
            name: ``"lottery"`` or ``"shortest_queue"``.

        Raises:
            ValueError: if ``name`` is neither of the accepted strings; the
                message includes the rejected value.
        """
        if name == "lottery":
            return cls.LOTTERY
        if name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        raise ValueError(f"Invalid dispatch method: {name!r}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Bookkeeping record the controller keeps for one registered worker."""

    # Names of the models this worker reports serving.
    model_names: List[str]
    # Relative speed reported by the worker; used as a lottery weight and as
    # the divisor of the queue length when picking the shortest queue.
    speed: int
    # Number of queued requests last reported (or estimated) for the worker.
    queue_length: int
    # Whether the worker is subject to heart-beat expiration.
    check_heart_beat: bool
    # ``time.time()`` timestamp of the last heart beat; compared numerically
    # against the expiration cut-off, hence a float rather than a string.
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Periodically evict workers whose heart beat has expired.

    Loops forever: sleeps for ``CONTROLLER_HEART_BEAT_EXPIRATION`` (passed
    directly to ``time.sleep``) and then calls
    ``controller.remove_stable_workers_by_expiration()``.  Never returns.
    """
    while True:
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of workers that hands out worker addresses to clients.

    Workers are tracked in ``self.worker_info`` (worker name -> ``WorkerInfo``).
    A daemon thread started in ``__init__`` runs ``heart_beat_controller`` to
    drop workers whose heart beat has expired.
    """

    def __init__(self, dispatch_method: str):
        # Dict[str -> WorkerInfo]
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add or refresh a worker entry.

        If ``worker_status`` is empty the status is fetched from the worker.
        Returns ``True`` on success and ``False`` when no status is available.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
        if not worker_status:
            return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST to the worker's ``/worker_get_status`` and return the JSON body.

        Returns ``None`` when the request fails or the status code is not 200.
        """
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Forget ``worker_name``; raises ``KeyError`` if it is not registered."""
        del self.worker_info[worker_name]

    def refresh_all_workers(self):
        """Re-register every known worker, dropping those that fail to respond."""
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Return the distinct model names served by the registered workers."""
        model_names = set()

        for w_info in self.worker_info.values():
            model_names.update(w_info.model_names)

        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Pick a worker serving ``model_name`` using the dispatch method.

        Returns the worker name (its address), or ``""`` when no suitable
        worker is available.

        Raises:
            ValueError: if ``self.dispatch_method`` is not a known method.
        """
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            if norm < 1e-4:
                # No candidate, or every candidate reports (near-)zero speed.
                return ""
            worker_speeds = worker_speeds / norm
            # Weighted random draw; the address is returned without probing
            # the worker first.
            pt = np.random.choice(np.arange(len(worker_names)),
                                  p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Count the request about to be dispatched until the next heart
            # beat overwrites the queue length.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heart beat; return ``False`` if the worker is unknown."""
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        """Remove workers whose last heart beat is older than the expiration.

        Only workers registered with ``check_heart_beat`` set are considered.
        The name is kept as is because ``heart_beat_controller`` calls it.
        """
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        # Collect first: the dict must not change size while being iterated.
        to_delete = [
            worker_name
            for worker_name, w_info in self.worker_info.items()
            if w_info.check_heart_beat and w_info.last_heart_beat < expire
        ]

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Proxy a streaming generation request to a worker.

        Yields NUL-terminated chunks.  When no worker is available, or the
        request to the worker fails, a single JSON error record is yielded.
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # Stop here: without this the code fell through and attempted a
            # request against an empty address, emitting a second error.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        """Aggregate the live status of all registered workers.

        Queries each worker and returns the union of model names together
        with the summed speed and queue length of those that responded.
        """
        model_names = set()
        speed = 0
        queue_length = 0

        for w_name in self.worker_info:
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
+ app = FastAPI()
240
+
241
+
242
@app.post("/register_worker")
async def register_worker(request: Request):
    """HTTP endpoint: register the worker described by the JSON body.

    Reads ``worker_name``, ``check_heart_beat`` and the optional
    ``worker_status`` from the body and passes them to the module-level
    ``controller``.
    """
    payload = await request.json()
    name = payload["worker_name"]
    check_heart_beat = payload["check_heart_beat"]
    status = payload.get("worker_status")
    controller.register_worker(name, check_heart_beat, status)
248
+
249
+
250
@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """HTTP endpoint: ask the module-level ``controller`` to re-register all workers."""
    # Controller.refresh_all_workers returns nothing, so there is no result
    # to keep (the original bound it to an unused local).
    controller.refresh_all_workers()
253
+
254
+
255
@app.post("/list_models")
async def list_models():
    """HTTP endpoint: return the model names known to the controller."""
    return {"models": controller.list_models()}
259
+
260
+
261
@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """HTTP endpoint: return a worker address for the ``model`` in the JSON body."""
    payload = await request.json()
    return {"address": controller.get_worker_address(payload["model"])}
266
+
267
+
268
@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """HTTP endpoint: record a worker heart beat.

    The response's ``exist`` flag is the value returned by
    ``controller.receive_heart_beat``.
    """
    payload = await request.json()
    name = payload["worker_name"]
    queue_length = payload["queue_length"]
    return {"exist": controller.receive_heart_beat(name, queue_length)}
274
+
275
+
276
@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """HTTP endpoint: stream generation output proxied through the controller."""
    body = await request.json()
    return StreamingResponse(controller.worker_api_generate_stream(body))
281
+
282
+
283
@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """HTTP endpoint: return the controller's aggregated worker status."""
    status = controller.worker_api_get_status()
    return status
286
+
287
+
288
+ if __name__ == "__main__":
289
+ parser = argparse.ArgumentParser()
290
+ parser.add_argument("--host", type=str, default="localhost")
291
+ parser.add_argument("--port", type=int, default=21001)
292
+ parser.add_argument("--dispatch-method", type=str, choices=[
293
+ "lottery", "shortest_queue"], default="shortest_queue")
294
+ args = parser.parse_args()
295
+ logger.info(f"args: {args}")
296
+
297
+ controller = Controller(args.dispatch_method)
298
+ uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/gradio_web_server.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import hashlib
5
+ import requests
6
+ import argparse
7
+ import datetime
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from decord import VideoReader, cpu
12
+
13
+ from videollama2.constants import LOGDIR, NUM_FRAMES
14
+ from videollama2.conversation import (default_conversation, conv_templates,SeparatorStyle)
15
+ from videollama2.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
16
+
17
+
18
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
19
+
20
+ headers = {"User-Agent": "Videollama2 Client"}
21
+
22
+ no_change_btn = gr.Button.update()
23
+ enable_btn = gr.Button.update(interactive=True)
24
+ disable_btn = gr.Button.update(interactive=False)
25
+
26
+ priority = {
27
+ "vicuna-13b": "aaaaaaa",
28
+ "koala-13b": "aaaaaab",
29
+ }
30
+
31
+
32
def get_conv_log_filename():
    """Return the path of today's conversation log inside ``LOGDIR``.

    The file name is ``YYYY-MM-DD-conv.json`` built from the current local
    date.
    """
    now = datetime.datetime.now()
    filename = f"{now.year}-{now.month:02d}-{now.day:02d}-conv.json"
    return os.path.join(LOGDIR, filename)
36
+
37
+
38
def get_model_list():
    """Refresh the controller's workers and return the sorted model names.

    Models listed in ``priority`` sort by their priority string; all others
    sort by their own name.
    """
    base_url = args.controller_url
    refresh_response = requests.post(base_url + "/refresh_all_workers")
    assert refresh_response.status_code == 200

    list_response = requests.post(base_url + "/list_models")
    models = list_response.json()["models"]
    models.sort(key=lambda name: priority.get(name, name))
    logger.info(f"Models: {models}")
    return models
46
+
47
+
48
+ get_window_url_params = """
49
+ function() {
50
+ const params = new URLSearchParams(window.location.search);
51
+ url_params = Object.fromEntries(params);
52
+ console.log(url_params);
53
+ return url_params;
54
+ }
55
+ """
56
+
57
+
58
def load_demo(url_params, request: gr.Request):
    """Create the initial conversation state and model-dropdown update.

    If the URL parameters name a model that is in the module-level
    ``models`` list, the dropdown is preselected to that model.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    dropdown_update = gr.Dropdown.update(visible=True)
    if "model" in url_params and url_params["model"] in models:
        dropdown_update = gr.Dropdown.update(
            value=url_params["model"], visible=True)

    return default_conversation.copy(), dropdown_update
70
+
71
+
72
def load_demo_refresh_model_list(request: gr.Request):
    """Create the initial state after re-querying the controller for models.

    The dropdown is filled with the fresh model list and set to its first
    entry, or to an empty string when the list is empty.
    """
    logger.info(f"load_demo. ip: {request.client.host}")
    models = get_model_list()
    state = default_conversation.copy()
    selected = models[0] if models else ""
    dropdown_update = gr.Dropdown.update(choices=models, value=selected)
    return state, dropdown_update
81
+
82
+
83
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one JSON line describing a vote to today's conversation log."""
    with open(get_conv_log_filename(), "a") as fout:
        record = dict(
            tstamp=round(time.time(), 4),
            type=vote_type,
            model=model_selector,
            state=state.dict(),
            ip=request.client.host,
        )
        fout.write(json.dumps(record) + "\n")
93
+
94
+
95
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an upvote; return an empty string plus three disabled-button updates."""
    logger.info(f"upvote. ip: {request.client.host}")
    vote_last_response(state, "upvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
99
+
100
+
101
def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a downvote; return an empty string plus three disabled-button updates."""
    logger.info(f"downvote. ip: {request.client.host}")
    vote_last_response(state, "downvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
105
+
106
+
107
def flag_last_response(state, model_selector, request: gr.Request):
    """Log a flag; return an empty string plus three disabled-button updates."""
    logger.info(f"flag. ip: {request.client.host}")
    vote_last_response(state, "flag", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
111
+
112
+
113
def regenerate(state, image_process_mode, request: gr.Request):
    """Reset the last reply so it can be generated again.

    Clears the content of the last message and, when the preceding message
    carries a tuple/list payload, rebuilds it from its first two items plus
    the current ``image_process_mode``.
    """
    logger.info(f"regenerate. ip: {request.client.host}")
    state.messages[-1][-1] = None
    human_turn = state.messages[-2]
    payload = human_turn[1]
    if type(payload) in (tuple, list):
        human_turn[1] = (*payload[:2], image_process_mode)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    chatbot = state.to_gradio_chatbot()
    return (state, chatbot, "", None, None) + (disable_btn,) * 5
122
+
123
+
124
def clear_history(request: gr.Request):
    """Start a fresh conversation and clear the input widgets."""
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh_state = default_conversation.copy()
    chatbot = fresh_state.to_gradio_chatbot()
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (fresh_state, chatbot, "", None, None) + (disable_btn,) * 5
129
+
130
+
131
def add_text_ori(state, text, image, video, image_process_mode, request: gr.Request):
    """Legacy handler that appends a user turn (text plus optional media).

    ``image`` is a PIL object while ``video`` is a file path.  Supplying both
    at once is rejected with an AssertionError.

    Returns:
        (state, chatbot, textbox, imagebox, videobox) followed by five button
        updates (upvote, downvote, flag, regenerate, clear).  Every return
        path uses this same 10-element layout.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
    # Nothing to send: no text and no media of either kind.
    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                no_change_btn,) * 5
    assert image is None or video is None, "Please don't feed image and video inputs at the same time!!!"
    text = text[:1536]  # Hard cut-off
    if image is not None:
        # here image is the PIL object itself
        text = text[:1200]  # Hard cut-off for images
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        # A conversation that already holds an image is restarted.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "image"
    if video is not None:
        print("Video box:", video)
        # here video is the file path of video
        text = text[:1200]  # Hard cut-off for videos
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        # A conversation that already holds a video is restarted.
        if len(state.get_videos(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "video"
        print("Set modality as video...")
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
172
+
173
+
174
def add_text(state, text, image, video, image_process_mode, request: gr.Request):
    """Append a user turn (text plus optional image or video) to the conversation.

    A newly supplied image or video always restarts the conversation from the
    default template.  When both are given, the video takes precedence.

    Returns:
        (state, chatbot, textbox, imagebox, videobox) followed by five button
        updates (upvote, downvote, flag, regenerate, clear) -- ten values on
        every path, matching the ten output components wired to this handler.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    # New media input resets the state.
    if image is not None or video is not None:
        state = default_conversation.copy()

    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Ten values, like every other return (the original yielded nine).
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (no_change_btn,) * 5

    if video is not None:
        text = text[:1200]  # Hard cut-off when media is attached
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        state.modality = "video"
    elif image is not None:
        text = text[:1200]  # Hard cut-off when media is attached
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        state.modality = "image"
    else:
        # Text-only follow-up.  The text is non-empty here (the empty case
        # returned above); the cut-off now applies to every text-only turn,
        # not only the first one after a media turn.
        text = text[:1536]  # Hard cut-off
        if state.modality == "image":
            state.modality = "image_text"
        elif state.modality == "video":
            state.modality = "video_text"

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
217
+
218
+
219
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
    """Stream a model reply for the latest user turn (generator for Gradio).

    Steps: optionally rebuild the conversation from a template on the first
    round, ask the controller for a worker address, save the turn's images or
    video frames under LOGDIR, POST the request to the worker and yield the
    partial output as it streams in, then append a record to the conversation log.

    Yields:
        (state, chatbot) followed by five button updates
        (upvote, downvote, flag, regenerate, clear).
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation
        name_lower = model_name.lower()
        if "llava" in name_lower:
            if 'llama-2' in name_lower:
                template_name = "llava_llama2"
            elif "v1" in name_lower:
                if 'mmtag' in name_lower:
                    template_name = "v1_mmtag"
                elif 'plain' in name_lower and 'finetune' not in name_lower:
                    template_name = "v1_mmtag"
                else:
                    template_name = "llava_v1"
            else:
                if 'mmtag' in name_lower:
                    template_name = "v0_mmtag"
                elif 'plain' in name_lower and 'finetune' not in name_lower:
                    template_name = "v0_mmtag"
                else:
                    template_name = "llava_v0"
        elif "llama-2" in model_name:
            template_name = "llama2"
        else:
            template_name = "vicuna_v1"
        # NOTE(review): the template is forced to "llava_v1" right after the
        # selection above, which makes that selection dead code -- confirm
        # this override is intentional.
        template_name = "llava_v1"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        new_state.modality = state.modality
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Construct prompt
    prompt = state.get_prompt()
    is_image = state.modality in ("image", "image_text")
    is_video = state.modality in ("video", "video_text")
    if is_image:
        all_images = state.get_images(return_pil=True)  # PIL.Image objects
    elif is_video:
        all_images = state.get_videos(return_pil=True)  # video frames, each a PIL.Image
    else:
        # Any other modality has no media; avoids an unbound name below.
        all_images = []
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    for idx, (image, image_hash) in enumerate(zip(all_images, all_image_hash)):
        t = datetime.datetime.now()
        day_dir = f"{t.year}-{t.month:02d}-{t.day:02d}"
        if is_image:
            filename = os.path.join(LOGDIR, "serve_images", day_dir, f"{image_hash}.jpg")
        else:
            filename = os.path.join(LOGDIR, "serve_videos", day_dir, f"{image_hash}_{idx}.jpg")
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE] else state.sep2,
        # Placeholder summary so the log line below stays short; the real
        # payload is filled in after logging.
        "images": f'List of {len(all_image_hash)} images: {all_image_hash}',
    }
    logger.info(f"==== request ====\n{pload}")

    if is_image:
        pload['images'] = state.get_images()
    elif is_video:
        pload['images'] = state.get_videos()
    else:
        pload['images'] = []

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Defined up front so the final log call works even if no chunk arrives.
    output = ""
    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    output = data["text"][len(prompt):].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Drop the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            # Previously logged start_tstamp here, so durations were always 0.
            "finish": round(finish_tstamp, 4),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
350
+
351
# Page heading shown above the demo (skipped in embed mode).
title_markdown = ("""
# The public release of VideoLLaMA2
""")

# Terms-of-use notice rendered below the chat area.
tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")


# License notice rendered below the terms of use.
learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
""")

# Extra CSS passed to gr.Blocks; targets the row with elem_id="buttons".
block_css = """

#buttons button {
min-width: min(120px,100%);
}

"""
376
+
377
def build_demo(embed_mode):
    """Build and return the Gradio Blocks app for the controller-backed demo.

    Args:
        embed_mode: when true, the title, terms-of-use and license markdown
            sections are omitted.

    Reads the module-level ``models`` list and ``args`` namespace.

    Raises:
        ValueError: if ``args.model_list_mode`` is neither "once" nor "reload".
    """
    # Created before the Blocks context and placed later via textbox.render().
    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
    with gr.Blocks(title="Video-Llama", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    initial_model = models[0] if len(models) > 0 else ""
                    model_selector = gr.Dropdown(
                        choices=models,
                        value=initial_model,
                        interactive=True,
                        show_label=False,
                        container=False)

                imagebox = gr.Image(type="pil")
                videobox = gr.Video()
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                cur_dir = os.path.dirname(os.path.abspath(__file__))
                image_examples = [
                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
                    [f"{cur_dir}/examples/desert.jpg", "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?"],
                ]
                gr.Examples(examples=image_examples, inputs=[imagebox, textbox], label="Image examples")

                # video example inputs
                video_examples = [
                    [f"{cur_dir}/examples/sample_demo_1.mp4", "Why is this video funny?"],
                    [f"{cur_dir}/examples/sample_demo_3.mp4", "Can you identify any safety hazards in this video?"],
                    [f"{cur_dir}/examples/1034346401.mp4", "What is this young woman doing?"],
                ]
                gr.Examples(examples=video_examples, inputs=[videobox, textbox], label="Video examples")

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

            with gr.Column(scale=8):
                chatbot = gr.Chatbot(elem_id="chatbot", label="Videollama2 Chatbot", height=550)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary")
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        url_params = gr.JSON(visible=False)

        # Register listeners
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
        vote_inputs = [state, model_selector]
        vote_outputs = [textbox, upvote_btn, downvote_btn, flag_btn]
        ui_outputs = [state, chatbot, textbox, imagebox, videobox] + btn_list
        add_text_inputs = [state, textbox, imagebox, videobox, image_process_mode]
        bot_inputs = [state, model_selector, temperature, top_p, max_output_tokens]
        bot_outputs = [state, chatbot] + btn_list

        upvote_btn.click(upvote_last_response, vote_inputs, vote_outputs)
        downvote_btn.click(downvote_last_response, vote_inputs, vote_outputs)
        flag_btn.click(flag_last_response, vote_inputs, vote_outputs)

        regenerate_event = regenerate_btn.click(regenerate, [state, image_process_mode], ui_outputs)
        regenerate_event.then(http_bot, bot_inputs, bot_outputs)

        clear_btn.click(clear_history, None, ui_outputs)

        # Pressing ENTER and clicking "Send" trigger the same two-step chain.
        enter_event = textbox.submit(add_text, add_text_inputs, ui_outputs)
        enter_event.then(http_bot, bot_inputs, bot_outputs)
        send_event = submit_btn.click(add_text, add_text_inputs, ui_outputs)
        send_event.then(http_bot, bot_inputs, bot_outputs)

        list_mode = args.model_list_mode
        if list_mode == "once":
            demo.load(load_demo, [url_params], [state, model_selector],
                      _js=get_window_url_params)
        elif list_mode == "reload":
            demo.load(load_demo_refresh_model_list, None, [state, model_selector])
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
472
+
473
+
474
# Script entry point: parse CLI options, fetch the model list, build the UI
# and launch it behind a request queue.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    # No default: server_port is passed as None unless --port is given.
    parser.add_argument("--port", type=int)
    parser.add_argument("--controller-url", type=str, default="http://localhost:21001")
    parser.add_argument("--concurrency-count", type=int, default=10)
    parser.add_argument("--model-list-mode", type=str, default="once",
                        choices=["once", "reload"])
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--moderate", action="store_true")
    parser.add_argument("--embed", action="store_true")
    # `args` and `models` become module globals read by the handlers and build_demo.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    models = get_model_list()

    logger.info(args)
    demo = build_demo(args.embed)
    demo.queue(
        concurrency_count=args.concurrency_count,
        api_open=False
    ).launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share
    )
videollama2/serve/gradio_web_server_adhoc.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+
3
+ import os
4
+ import re
5
+
6
+ import torch
7
+ import gradio as gr
8
+
9
+ import sys
10
+ sys.path.append('./')
11
+ from videollama2 import model_init, mm_infer
12
+ from videollama2.utils import disable_torch_init
13
+
14
+
15
# HTML header (logo, title and badge links) rendered at the top of the page.
title_markdown = ("""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
<img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
</a>
<div>
<h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
<h5 style="margin: 0;">If this demo please you, please give us a star ⭐ on Github or 💖 on this space.</h5>
</div>
</div>


<div align="center">
<div style="display:flex; gap: 0.25rem; margin-top: 10px;" align="center">
<a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2"><img src='https://img.shields.io/badge/Github-VideoLLaMA2-9C276A'></a>
<a href="https://arxiv.org/pdf/2406.07476.pdf"><img src="https://img.shields.io/badge/Arxiv-2406.07476-AD1C18"></a>
<a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2/stargazers"><img src="https://img.shields.io/github/stars/DAMO-NLP-SG/VideoLLaMA2.svg?style=social"></a>
</div>
</div>
""")


# Extra CSS passed to gr.Blocks; targets the row with elem_id="buttons".
block_css = """
#buttons button {
min-width: min(120px,100%);
color: #9C276A
}
"""


# Terms-of-use notice rendered below the examples.
tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")


# License notice rendered below the terms of use.
learn_more_markdown = ("""
### License
This project is released under the Apache 2.0 license as found in the LICENSE file. The service is a research preview intended for non-commercial use ONLY, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
""")


# Custom colour scale used as the theme's primary hue.
plum_color = gr.themes.colors.Color(
    name='plum',
    c50='#F8E4EF',
    c100='#E9D0DE',
    c200='#DABCCD',
    c300='#CBA8BC',
    c400='#BC94AB',
    c500='#AD809A',
    c600='#9E6C89',
    c700='#8F5878',
    c800='#804467',
    c900='#713056',
    c950='#662647',
)
74
+
75
+
76
class Chat:
    """Thin wrapper that loads the model once and runs single-turn inference."""

    def __init__(self, model_path, load_8bit=False, load_4bit=False):
        """Load model, processor and tokenizer from ``model_path`` via ``model_init``."""
        disable_torch_init()

        loaded = model_init(model_path, load_8bit=load_8bit, load_4bit=load_4bit)
        self.model, self.processor, self.tokenizer = loaded

    @spaces.GPU(duration=120)
    @torch.inference_mode()
    def generate(self, data: list, message, temperature, top_p, max_output_tokens):
        """Run ``mm_infer`` on the single (tensor, modality-tag) pair in ``data``.

        The modality tag (e.g. '<image>') is passed on with its angle brackets
        stripped.  Sampling is enabled only when ``temperature`` is above zero.
        """
        # TODO: support multiple turns of conversation.
        assert len(data) == 1

        media_tensor, modal_tag = data[0]
        use_sampling = temperature > 0.0
        return mm_infer(
            media_tensor,
            message,
            self.model,
            self.tokenizer,
            modal=modal_tag.strip('<>'),
            do_sample=use_sampling,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_output_tokens,
        )
97
+
98
+
99
@spaces.GPU(duration=120)
def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
    """Handle one chat turn: preprocess the media, run the model, update history.

    Args:
        image: image file path or None.
        video: video file path or None.
        message: list of {'role', 'content'} dicts (model-side history); mutated.
        chatbot: list of [user_html, reply] pairs shown in the UI; mutated.
        textbox_in: the user's text for this turn.
        temperature, top_p, max_output_tokens: generation parameters.
        dtype: dtype the processed media tensor is cast to.

    Returns:
        (image update, video update, message, chatbot).  If preprocessing
        fails, both media inputs are cleared and the histories are returned
        unchanged.
    """
    # Imported locally: the handler below needs it, and the module's import
    # block does not provide it.
    import traceback

    data = []

    processor = handler.processor
    try:
        if image is not None:
            data.append((processor['image'](image).to(handler.model.device, dtype=dtype), '<image>'))
        elif video is not None:
            data.append((processor['video'](video).to(handler.model.device, dtype=dtype), '<video>'))
        elif image is None and video is None:
            data.append((None, '<text>'))
        else:
            # NOTE(review): unreachable -- when both inputs are set, the first
            # branch wins and the image is used.
            raise NotImplementedError("Not support image and video at the same time")
    except Exception:
        traceback.print_exc()
        return gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), message, chatbot

    assert len(message) % 2 == 0, "The message should be a pair of user and system message."

    # HTML snippet embedding the media in the user's chat bubble.
    show_images = ""
    if image is not None:
        show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
    if video is not None:
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'

    one_turn_chat = [textbox_in, None]

    # 1. first run case
    if len(chatbot) == 0:
        one_turn_chat[0] += "\n" + show_images
    # 2. not first run case
    else:
        # Scan backwards for the most recent turn that embedded an image or video.
        length = len(chatbot)
        for i in range(length - 1, -1, -1):
            previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[i][0])
            previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[i][0])

            if len(previous_image) > 0:
                previous_image = previous_image[-1]
                # 2.1 a video, or an image with a different file name, starts a new conversation
                if (video is not None) or (image is not None and os.path.basename(previous_image) != os.path.basename(image)):
                    message.clear()
                    one_turn_chat[0] += "\n" + show_images
                break
            elif len(previous_video) > 0:
                previous_video = previous_video[-1]
                # 2.2 an image, or a video with a different file name, starts a new conversation
                if image is not None or (video is not None and os.path.basename(previous_video) != os.path.basename(video)):
                    message.clear()
                    one_turn_chat[0] += "\n" + show_images
                break

    message.append({'role': 'user', 'content': textbox_in})
    text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
    message.append({'role': 'assistant', 'content': text_en_out})

    one_turn_chat[1] = text_en_out
    chatbot.append(one_turn_chat)

    return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot
161
+
162
+
163
def regenerate(message, chatbot):
    """Drop the last user/assistant exchange so it can be generated again.

    Removes up to the last two entries of ``message`` and the last entry of
    ``chatbot`` in place.  Empty histories are left untouched instead of
    raising IndexError (e.g. "Regenerate" clicked before any chat).

    Returns:
        The same ``message`` and ``chatbot`` objects, as a tuple.
    """
    # Slice deletion tolerates lists shorter than two elements.
    del message[-2:]
    if chatbot:
        chatbot.pop()
    return message, chatbot
167
+
168
+
169
def clear_history(message, chatbot):
    """Empty both histories in place and reset the three input widgets.

    Returns:
        (image update, video update, message, chatbot, textbox update), where
        each update clears the widget value and keeps it interactive.
    """
    message.clear()
    chatbot.clear()
    image_reset = gr.update(value=None, interactive=True)
    video_reset = gr.update(value=None, interactive=True)
    textbox_reset = gr.update(value=None, interactive=True)
    return (image_reset, video_reset, message, chatbot, textbox_reset)
175
+
176
+
177
# Known limitations of the Zero (spaces.GPU) environment:
# 1. The environment is fixed to torch>=2.0,<=2.2, gradio>=4.x.x
# 2. Operations or tensors that require CUDA are limited to functions wrapped with spaces.GPU
# 3. Such functions can't return tensors or other CUDA objects.

# Checkpoint loaded at import time.
model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'

# Module-level model handle used by generate().
handler = Chat(model_path, load_8bit=False, load_4bit=True)

# Created here and placed in the layout later via textbox.render().
textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)

# Default theme recoloured with the plum palette.
theme = gr.themes.Default(primary_hue=plum_color)
# theme.update_color("primary", plum_color.c500)
theme.set(slider_color="#9C276A")
theme.set(block_title_text_color="#9C276A")
theme.set(block_label_text_color="#9C276A")
theme.set(button_primary_text_color="#9C276A")
194
+ # theme.set(button_secondary_text_color="*neutral_800")
195
+
196
+
197
# UI layout: media inputs and generation parameters on the left, chat on the
# right, example prompts underneath, then the legal notices.
with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
    gr.Markdown(title_markdown)
    # Model-side conversation history (list of role/content dicts).
    message = gr.State([])

    with gr.Row():
        with gr.Column(scale=3):
            # The image is passed to handlers as a file path.
            image = gr.Image(label="Input Image", type="filepath")
            video = gr.Video(label="Input Video")

            with gr.Accordion("Parameters", open=True) as parameter_row:
                # num_beams = gr.Slider(
                #     minimum=1,
                #     maximum=10,
                #     value=1,
                #     step=1,
                #     interactive=True,
                #     label="beam search numbers",
                # )

                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.2,
                    step=0.1,
                    interactive=True,
                    label="Temperature",
                )

                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    interactive=True,
                    label="Top P",
                )

                max_output_tokens = gr.Slider(
                    minimum=64,
                    maximum=1024,
                    value=512,
                    step=64,
                    interactive=True,
                    label="Max output tokens",
                )

        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
            with gr.Row():
                with gr.Column(scale=8):
                    textbox.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
            with gr.Row(elem_id="buttons") as button_row:
                # NOTE(review): no click handler is registered for the two vote buttons below.
                upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
                downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
                # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
                # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
                clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

    with gr.Row():
        with gr.Column():
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/examples/extreme_ironing.jpg",
                        "What happens in this image?",
                    ],
                    [
                        f"{cur_dir}/examples/waterview.jpg",
                        "What are the things I should be cautious about when I visit here?",
                    ],
                    [
                        f"{cur_dir}/examples/desert.jpg",
                        "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
                    ],
                ],
                inputs=[image, textbox],
            )
        with gr.Column():
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/../../assets/cat_and_chicken.mp4",
                        "What happens in this video?",
                    ],
                    [
                        f"{cur_dir}/../../assets/sora.mp4",
                        "Please describe this video.",
                    ],
                    [
                        f"{cur_dir}/examples/sample_demo_1.mp4",
                        "What does the baby do?",
                    ],
                ],
                inputs=[video, textbox],
            )

    gr.Markdown(tos_markdown)
    gr.Markdown(learn_more_markdown)

    # "Send" runs one chat turn.
    submit_btn.click(
        generate,
        [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
        [image, video, message, chatbot])

    # "Regenerate" drops the last exchange, then runs generate again with the
    # current widget values.
    regenerate_btn.click(
        regenerate,
        [message, chatbot],
        [message, chatbot]).then(
        generate,
        [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
        [image, video, message, chatbot])

    clear_btn.click(
        clear_history,
        [message, chatbot],
        [image, video, message, chatbot, textbox])

# Launched at import time (no __main__ guard).
demo.launch()
videollama2/serve/model_worker.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import os
5
+ import json
6
+ import time
7
+ import uuid
8
+ import asyncio
9
+ import requests
10
+ import argparse
11
+ import threading
12
+ from threading import Thread
13
+ from functools import partial
14
+ from typing import Iterator, List, Optional, Tuple
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, Request, BackgroundTasks
18
+ from fastapi.responses import StreamingResponse
19
+
20
+ import torch
21
+ import decord
22
+ import numpy as np
23
+ from PIL import Image
24
+ from decord import VideoReader, cpu
25
+ from transformers import TextIteratorStreamer
26
+
27
+ from videollama2.constants import WORKER_HEART_BEAT_INTERVAL
28
+ from videollama2.utils import (build_logger, server_error_msg, pretty_print_semaphore)
29
+ from videollama2.model.builder import load_pretrained_model
30
+ from videollama2.mm_utils import process_images, process_videos, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, tokenizer_MMODAL_token
31
+ from videollama2.mm_utils import chunk_list, frame_expansion
32
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, NUM_FRAMES, MMODAL_TOKEN_INDEX
33
+
34
+
35
# Number of bytes in one GiB.
GB = 1 << 30

# Short random identifier for this worker process; also used in the log file name.
worker_id = str(uuid.uuid4())[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
global_counter = 0

# Starts as None; presumably assigned elsewhere in the file -- TODO confirm.
model_semaphore = None
42
+
43
+
44
def load_keywords(path):
    """Read blocked keywords from ``path``, one per line.

    Each keyword is stripped and lowercased because the safety checks match
    against ``text.lower()``.  Blank lines are skipped: an empty keyword is a
    substring of every text and would block all input and output.

    Returns:
        The list of keywords, or an empty list if the file does not exist.
    """
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as file:
        stripped = (line.strip().lower() for line in file)
        return [keyword for keyword in stripped if keyword]


# Keyword file location, relative to the working directory.
path = 'assets/keywords.txt'
KEYWORDS_LIST = load_keywords(path)


# Message returned when generated output matches a keyword.
KEYWORD_BLOCK_MESSAGE2 = "The output contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Message returned when the user's input matches a keyword.
KEYWORD_BLOCK_MESSAGE1 = "Your input question contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
STREAM_CHECK_MULTIPLE = 20
60
+
61
+
62
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever at a fixed interval.

    Intended as a thread target. Despite the parameter name, the object
    passed in is whatever exposes ``send_heart_beat`` (the worker itself in
    this file). The function never returns on its own.
    """
    while True:
        # Sleep first so the initial beat comes one full interval after start.
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
67
+
68
+
69
def safety_check(text, history=None, ) -> Optional[str]:
    """Screen generated text against the keyword blocklist.

    Returns ``KEYWORD_BLOCK_MESSAGE2`` when any entry of ``KEYWORDS_LIST``
    occurs in the lower-cased *text*, otherwise ``None``. *history* is
    accepted but not used.
    """
    if not KEYWORDS_LIST:
        return None

    for keyword in KEYWORDS_LIST:
        if keyword in text.lower():
            print('############')
            return KEYWORD_BLOCK_MESSAGE2

    return None
76
+
77
+
78
def input_safety_check(text) -> Optional[str]:
    """Screen a user prompt against the keyword blocklist.

    Returns ``KEYWORD_BLOCK_MESSAGE1`` when any entry of ``KEYWORDS_LIST``
    occurs in the lower-cased *text*, otherwise ``None``.
    """
    if not KEYWORDS_LIST:
        return None
    for keyword in KEYWORDS_LIST:
        if keyword in text.lower():
            print('######## Input keyword alarm triggered:', text)
            return KEYWORD_BLOCK_MESSAGE1
    return None
83
+
84
+
85
class ModelWorker:
    """Load one checkpoint and stream generations for it.

    Unless told otherwise, the worker registers itself with a controller and
    keeps sending heart beats from a background thread.
    """

    def __init__(self, controller_addr, worker_addr,
                 worker_id, no_register,
                 model_path, model_base, model_name,
                 load_8bit, load_4bit, device):
        """Load the model and, unless *no_register* is set, register it.

        When *model_name* is ``None`` the name is derived from the last
        component of *model_path*; for ``checkpoint-*`` directories the parent
        directory name is prepended.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        self.model_path = model_path
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            if model_paths[-1].startswith('checkpoint-'):
                # Joining the last (up to) two components also copes with a
                # bare "checkpoint-N" path that has no parent component.
                self.model_name = "_".join(model_paths[-2:])
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        self.device = device
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
        self.is_multimodal = 'videollama2' in self.model_name.lower() or 'vlb' in self.model_name.lower()

        if not no_register:
            self.register_to_controller()
            # Daemon thread: the endless heart-beat loop must not keep the
            # process alive once the server has stopped.
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """Announce this worker and its current status to the controller."""
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200, f"register_worker returned HTTP {r.status_code}"

    def send_heart_beat(self):
        """Report liveness and queue length; re-register if the controller forgot us.

        Request errors are logged and retried every five seconds until one
        attempt succeeds.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of requests running or waiting for a slot.

        Reads the module-level ``args`` and the semaphore's private counters.
        """
        if model_semaphore is None:
            return 0
        waiters = model_semaphore._waiters
        num_waiting = len(waiters) if waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + num_waiting

    def get_status(self):
        """Return the status record reported to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    def _load_visual_inputs(self, raw_inputs):
        """Turn the request's ``images`` payload into processed features.

        The payload is first decoded as base64 images. If that fails for any
        reason, the first entry is opened as a video with decord instead.

        Returns ``(features, modality, modal_token, modal_token_index)``.
        """
        image_processor, model = self.image_processor, self.model
        try:
            print("Load image...")
            pil_images = [load_image_from_base64(image) for image in raw_inputs]
            features = process_images(pil_images, image_processor, model.config)
            return features, "image", DEFAULT_IMAGE_TOKEN, MMODAL_TOKEN_INDEX["IMAGE"]
        except Exception:
            # Any failure on the image path means "try video". The untouched
            # request payload is used below, not a half-processed value.
            print("Load video instead...")

        decord_vr = VideoReader(uri=raw_inputs[0], ctx=cpu(0))
        duration = len(decord_vr)
        if "use_taug" not in self.model_path:
            frame_id_list = np.linspace(0, duration - 1, 8, dtype=int)
            video_frames = decord_vr.get_batch(frame_id_list).asnumpy()
            features = process_videos(video_frames, image_processor, model.config)
        else:
            print("Temporal augmentation activated!!!")
            frame_id_list = np.linspace(0, duration - 1, 8 * 2 * 2, dtype=int)
            video_data = decord_vr.get_batch(frame_id_list)
            video_frames = [Image.fromarray(f) for f in video_data.asnumpy()]
            chunked_video_frames = chunk_list(video_frames, 2 * 2)
            expanded_video_frames = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
            features = process_videos(expanded_video_frames, image_processor, model.config)
        return features, "video", DEFAULT_VIDEO_TOKEN, MMODAL_TOKEN_INDEX["VIDEO"]

    @torch.inference_mode()
    def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks holding the text generated so far.

        *params* is the request dict: ``prompt`` is required; ``images``,
        ``temperature``, ``top_p``, ``max_new_tokens`` and ``stop`` are
        optional. Each chunk contains the original prompt followed by the
        output produced up to that point.

        Raises ``ValueError`` when the number of visual inputs matches neither
        the ``<image>`` nor the ``<video>`` token count of the prompt.
        """
        tokenizer, model = self.tokenizer, self.model

        prompt = params["prompt"]
        ori_prompt = prompt
        raw_inputs = params.get("images", None)
        num_image_tokens = 0
        # Text-only requests still need an index for the tokenizer helper.
        # NOTE(review): assumes the helper is a no-op for this index when the
        # prompt contains no modal token - confirm against mm_utils.
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        image_args = {}

        if raw_inputs is not None and len(raw_inputs) and self.is_multimodal:
            if len(raw_inputs) != prompt.count(DEFAULT_IMAGE_TOKEN) and len(raw_inputs) != prompt.count(DEFAULT_VIDEO_TOKEN):
                raise ValueError("Number of images/videos does not match number of <image>/<video> tokens in prompt")

            features, modality, replace_token, modal_token_index = self._load_visual_inputs(raw_inputs)

            if isinstance(features, list):
                features = [f.to(model.device, dtype=torch.float16) for f in features]
            else:
                features = features.to(model.device, dtype=torch.float16)
                if modality == "video":
                    print("Video:", features.shape)
                    features = [features]
                else:
                    print("Image:", features.shape)

            # A video request may still carry <image> placeholders; rewrite
            # them to the token of the modality that was actually loaded.
            if replace_token != DEFAULT_IMAGE_TOKEN:
                prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
            if getattr(model.config, 'mm_use_im_start_end', False):
                wrapped_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
                prompt = prompt.replace(replace_token, wrapped_token)
                replace_token = wrapped_token

            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
            image_args = {"images_or_videos": features, "modal_list": [modality]}
        print("image_args:", image_args)

        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        do_sample = temperature > 0.001

        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

        if max_new_tokens < 1:
            yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
            return

        generate_kwargs = dict(
            inputs=input_ids,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            **image_args
        )
        # Only install a keyword stopping criterion when a stop string was
        # actually supplied; a missing or empty one cannot be tokenized.
        if stop_str:
            generate_kwargs["stopping_criteria"] = [
                KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]

        # generate() blocks, so it runs in a thread while this generator
        # drains the streamer.
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        generated_text = ori_prompt
        token_count = 0
        for new_text in streamer:
            generated_text += new_text
            token_count += len(tokenizer.encode(new_text))
            if token_count >= STREAM_CHECK_MULTIPLE:
                safety_message = safety_check(generated_text)
                if safety_message:
                    print('####### Keyword alarm triggered:', generated_text)
                    yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                    return
                token_count = 0

            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"

    @staticmethod
    def _error_chunk():
        """Return the NUL-terminated JSON chunk that reports a server error."""
        ret = {
            "text": server_error_msg,
            "error_code": 1,
        }
        return json.dumps(ret).encode() + b"\0"

    def generate_stream_gate(self, params):
        """Wrap :meth:`generate_stream` with input screening and error reporting.

        A blocked prompt yields a single chunk carrying the block message.
        Any exception raised while generating is printed and turned into one
        final error chunk instead of propagating to the HTTP layer.
        """
        try:
            input_text = params.get("prompt", "")
            safety_message = input_safety_check(input_text)
            if safety_message:
                yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                return

            for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield self._error_chunk()
        except torch.cuda.CudaError as e:
            print("Caught torch.cuda.CudaError:", e)
            yield self._error_chunk()
        except Exception as e:
            print("Caught Unknown Error", e)
            yield self._error_chunk()
333
+
334
+
335
+ app = FastAPI()
336
+
337
+
338
def release_model_semaphore(fn=None):
    """Free one concurrency slot, then run the optional callback *fn*."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
342
+
343
+
344
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Stream a generation for the JSON request body.

    Concurrency is capped by a module-level semaphore that is created on
    first use. The slot taken here is given back by a background task once
    the streaming response has finished.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    try:
        worker.send_heart_beat()
        generator = worker.generate_stream_gate(params)
        background_tasks = BackgroundTasks()
        background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
        return StreamingResponse(generator, background=background_tasks)
    except Exception:
        # Nothing will run the background task if we fail before returning
        # the response, so give the slot back here to avoid leaking it.
        model_semaphore.release()
        raise
358
+
359
+
360
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's status record; the request body is not read."""
    status = worker.get_status()
    return status
363
+
364
+
365
if __name__ == "__main__":
    # The worker decides whether it is multimodal from the model name, which
    # must contain `videollama2` or `vlb`; the flag itself changes nothing.
    multi_modal_note = (
        "Multimodal mode is automatically detected from the model name, please make sure "
        "`videollama2` (or `vlb`) is included in the model name or path."
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--multi-modal", action="store_true", help=multi_modal_note)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    # `args` and `worker` stay module-level: the request handlers and
    # ModelWorker.get_queue_length read them as globals.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    if args.multi_modal:
        logger.warning(multi_modal_note)

    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/register_worker.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Manually register workers.
3
+
4
+ Usage:
5
+ python3 -m videollama2.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
6
+ """
7
+
8
+ import argparse
9
+
10
+ import requests
11
+
12
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both addresses are mandatory: without them the script would either fail
    # on `None + str` or register a worker named None.
    parser.add_argument("--controller-address", type=str, required=True)
    parser.add_argument("--worker-name", type=str, required=True)
    parser.add_argument("--check-heart-beat", action="store_true")
    args = parser.parse_args()

    url = args.controller_address + "/register_worker"
    data = {
        "worker_name": args.worker_name,
        "check_heart_beat": args.check_heart_beat,
        # No status is sent with a manual registration.
        "worker_status": None,
    }
    r = requests.post(url, json=data)
    assert r.status_code == 200, f"register_worker returned HTTP {r.status_code}"
videollama2/serve/sglang_worker.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import argparse
5
+ import asyncio
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import json
8
+ import time
9
+ import threading
10
+ import uuid
11
+
12
+ from fastapi import FastAPI, Request, BackgroundTasks
13
+ from fastapi.responses import StreamingResponse
14
+ import requests
15
+ import re
16
+ import uvicorn
17
+ from functools import partial
18
+
19
+ from llava.constants import WORKER_HEART_BEAT_INTERVAL
20
+ from llava.utils import (build_logger, server_error_msg,
21
+ pretty_print_semaphore)
22
+ from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square
23
+ from llava.constants import DEFAULT_IMAGE_TOKEN
24
+
25
+ import sglang as sgl
26
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
27
+
28
+
29
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random tag that distinguishes this worker process; it is also used
# in the log file name.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")

# Incremented by the /worker_generate_stream handler on every request.
global_counter = 0

# Created lazily by the /worker_generate_stream handler on first use.
model_semaphore = None
36
+
37
+
38
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever at a fixed interval.

    Intended as a thread target; the function never returns on its own.
    """
    while True:
        # Sleep first so the initial beat comes one full interval after start.
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
42
+
43
+
44
@sgl.function
def pipeline(s, prompt, max_tokens):
    """Feed the prompt parts to the sglang state, then generate "response".

    *prompt* is a sequence whose ``str`` items are appended as text; every
    other item is appended through ``sgl.image``.
    """
    for part in prompt:
        is_text = type(part) is str
        s += part if is_text else sgl.image(part)
    s += sgl.gen("response", max_tokens=max_tokens)
52
+
53
+
54
class ModelWorker:
    """Bridge between the controller and an sglang runtime endpoint."""

    def __init__(self, controller_addr, worker_addr, sgl_endpoint,
                 worker_id, no_register, model_name):
        """Connect to *sgl_endpoint* and, unless *no_register*, register.

        When *model_name* is ``None`` the name is derived from the model path
        reported by the endpoint.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id

        # Select backend
        backend = RuntimeEndpoint(sgl_endpoint)
        sgl.set_default_backend(backend)
        model_path = backend.model_info["model_path"]

        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is not None:
            self.model_name = model_name
        else:
            parts = model_path.split("/")
            last = parts[-1]
            if last.startswith('checkpoint-'):
                # Prefix checkpoint directories with their parent's name.
                self.model_name = parts[-2] + "_" + last
            else:
                self.model_name = last

        logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...")

        if no_register:
            return
        self.register_to_controller()
        self.heart_beat_thread = threading.Thread(
            target=heart_beat_worker, args=(self,), daemon=True)
        self.heart_beat_thread.start()

    def register_to_controller(self):
        """Announce this worker and its current status to the controller."""
        logger.info("Register to controller")

        payload = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(self.controller_addr + "/register_worker", json=payload)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report liveness and queue length; re-register if the controller forgot us.

        Request errors are logged and retried every five seconds until one
        attempt succeeds.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                body = {
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}
                ret = requests.post(url, json=body, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of requests running or waiting for a slot.

        Reads the module-level ``args`` and the semaphore's private counters.
        """
        if model_semaphore is None:
            return 0
        waiters = model_semaphore._waiters
        num_waiting = len(waiters) if waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + num_waiting

    def get_status(self):
        """Return the status record reported to the controller."""
        status = {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }
        return status

    async def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks holding the text generated so far.

        The prompt is split at every image token and interleaved with the
        decoded images before it is handed to the sglang pipeline. Raises
        ``ValueError`` when the image count and the token count disagree.
        """
        ori_prompt = prompt = params["prompt"]
        images = params.get("images", None)
        has_images = images is not None and len(images) > 0
        if not has_images:
            prompt = [prompt]
        else:
            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
                raise ValueError("Number of images does not match number of <image> tokens in prompt")

            images = [load_image_from_base64(image) for image in images]

            # FIXME: image-start/end tokens are not handled here.
            prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN)
            segments = prompt.split(DEFAULT_IMAGE_TOKEN)
            prompt = []
            for idx, segment in enumerate(segments):
                prompt.append(segment)
                if idx < len(images):
                    prompt.append(images[idx])

        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        # NOTE(review): the stop string is normalised here but never passed
        # on to the pipeline below - confirm whether that is intended.
        stop_str = [stop_str] if stop_str is not None else None

        print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p})
        state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True)

        generated_text = ori_prompt
        async for text_outputs in state.text_async_iter(var_name="response"):
            generated_text += text_outputs
            chunk = {"text": generated_text, "error_code": 0}
            yield json.dumps(chunk).encode() + b"\0"

    async def generate_stream_gate(self, params):
        """Wrap :meth:`generate_stream`, turning exceptions into an error chunk."""
        try:
            async for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            ret = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(ret).encode() + b"\0"
        except Exception as e:
            print("Caught Unknown Error", e)
            ret = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(ret).encode() + b"\0"
190
+
191
+
192
+ app = FastAPI()
193
+
194
+
195
def release_model_semaphore(fn=None):
    """Free one concurrency slot, then run the optional callback *fn*."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
199
+
200
+
201
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Stream a generation for the JSON request body.

    Concurrency is capped by a module-level semaphore that is created on
    first use. The slot taken here is given back by a background task once
    the streaming response has finished.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    try:
        worker.send_heart_beat()
        generator = worker.generate_stream_gate(params)
        background_tasks = BackgroundTasks()
        background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
        return StreamingResponse(generator, background=background_tasks)
    except Exception:
        # Nothing will run the background task if we fail before returning
        # the response, so give the slot back here to avoid leaking it.
        model_semaphore.release()
        raise
215
+
216
+
217
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's status record; the request body is not read."""
    status = worker.get_status()
    return status
220
+
221
+
222
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Where this worker listens.
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    # Addresses exchanged with the controller.
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    # Model / backend selection.
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sgl-endpoint", type=str)
    # Serving behaviour.
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    # `args` and `worker` stay module-level: the request handlers and
    # ModelWorker.get_queue_length read them as globals.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    worker = ModelWorker(
        args.controller_address,
        args.worker_address,
        args.sgl_endpoint,
        worker_id,
        args.no_register,
        args.model_name,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/test_message.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+ import requests
5
+
6
+ from llava.conversation import default_conversation
7
+
8
+
9
def main():
    """Send one test message to a worker and print the streamed reply.

    The worker is taken from ``--worker-address`` when given; otherwise the
    controller is asked for a worker serving ``--model-name``.
    """
    worker_addr = args.worker_address
    if not worker_addr:
        controller_addr = args.controller_address
        ret = requests.post(controller_addr + "/refresh_all_workers")
        ret = requests.post(controller_addr + "/list_models")
        models = ret.json()["models"]
        models.sort()
        print(f"Models: {models}")

        ret = requests.post(controller_addr + "/get_worker_address",
                            json={"model": args.model_name})
        worker_addr = ret.json()["address"]
        print(f"worker_addr: {worker_addr}")

    # An empty address is treated as "no worker available".
    if worker_addr == "":
        return

    conv = default_conversation.copy()
    conv.append_message(conv.roles[0], args.message)
    prompt = conv.get_prompt()

    pload = {
        "model": args.model_name,
        "prompt": prompt,
        "max_new_tokens": args.max_new_tokens,
        "temperature": 0.7,
        "stop": conv.sep,
    }
    response = requests.post(
        worker_addr + "/worker_generate_stream",
        headers={"User-Agent": "LLaVA Client"},
        json=pload,
        stream=True,
    )

    print(prompt.replace(conv.sep, "\n"), end="")
    # The worker terminates every JSON chunk with a NUL byte.
    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
        if not chunk:
            continue
        data = json.loads(chunk.decode("utf-8"))
        output = data["text"].split(conv.sep)[-1]
        # Carriage return so each chunk overwrites the previous one.
        print(output, end="\r")
    print("")
50
+
51
+
52
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--worker-address", type=str)
    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
    parser.add_argument("--max-new-tokens", type=int, default=32)
    parser.add_argument(
        "--message",
        type=str,
        default="Tell me a story with more than 1000 words.",
    )
    # main() reads `args` as a module-level global.
    args = parser.parse_args()

    main()
videollama2/train.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import re
19
+ import os
20
+ import copy
21
+ import json
22
+ import random
23
+ import pathlib
24
+ import traceback
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, Optional, Sequence, List
27
+
28
+ # torch-related packages
29
+ # NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
30
+ import torch
31
+ from torch.utils.data import Dataset
32
+
33
+ import transformers
34
+ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
35
+
36
+ import sys
37
+ sys.path.append('./')
38
+ from videollama2.model import *
39
+ from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
40
+ from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image
41
+ from videollama2.videollama2_trainer import (VideoLLaMA2Trainer,
42
+ get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
43
+ find_all_linear_names, safe_save_model_for_hf_trainer
44
+ )
45
+
46
+ # NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
47
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
48
+
49
+ local_rank = None
50
+
51
+
52
def rank0_print(*args):
    """Print *args*, but only when the module-level ``local_rank`` is 0."""
    if local_rank != 0:
        return
    print(*args)
55
+
56
+
57
def set_seed(seed=42):
    """
    Set the random seed for reproducible results.

    Seeds Python's ``random`` module and PyTorch (CPU and every CUDA device),
    and switches cuDNN to deterministic, non-benchmarking mode.

    :param seed: An integer value to be used as the random seed.
    """
    # Python's own generator is independent of torch and needs its own seed.
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
68
+
69
+
70
@dataclass
class ModelArguments:
    """Options selecting the language model, the connector and the vision tower."""
    # LLM Arguments
    # Key into the VLLMs registry (the help text lists the valid keys).
    model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
    # NOTE(review): presumably a hub id or local path of the base LLM - confirm against the loader.
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    # Connector Arguments
    mm_projector_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    # Vision tower Arguments
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
85
+
86
+
87
@dataclass
class DataArguments:
    """Options describing where the training data lives and how it is loaded."""
    # Path Arguments
    # NOTE: the default is None although the annotation is List[str].
    data_path: List[str] = field(default=None, metadata={"help": "Path to the training data."})
    # image_folder: Optional[str] = field(default=None)
    # video_folder: Optional[str] = field(default=None)
    # Single folder option that replaces the two commented-out ones above.
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    is_multimodal: bool = False
    lazy_preprocess: bool = False
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    image_aspect_ratio: str = 'square'
100
+
101
+
102
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` extended with multimodal, LoRA and quantization options."""
    optim: str = field(default="adamw_torch")
    # NOTE(review): presumably a separate learning rate for the projector; None looks like "not set" - confirm in the trainer.
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
136
+
137
+
138
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize two-turn samples without applying a chat template.

    Each source must hold exactly two turns, and the first turn must contain
    ``modal_token``. The turn texts are joined with a single space and
    tokenized; in the labels only the positions equal to the modal
    placeholder index are replaced by ``IGNORE_INDEX``.

    Returns:
        A dict with ``input_ids`` and ``labels``, each a list with one entry
        per source.
    """
    input_ids, targets = [], []
    for source in sources:
        assert len(source) == 2
        assert modal_token in source[0]['value']

        conversation = " ".join(turn['value'] for turn in source)
        token_ids = tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt')

        # Labels start as a copy of the inputs; only the placeholder is masked.
        labels = copy.deepcopy(token_ids)
        labels[token_ids == MODAL_INDEX_MAP[modal_token]] = IGNORE_INDEX

        input_ids.append(token_ids)
        targets.append(labels)

    return dict(input_ids=input_ids, labels=targets)
165
+
166
+
167
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize chat-style conversations and mask the instruction parts in the labels.

    Every conversation is rendered with the tokenizer's chat template. For
    each (user, assistant) pair, the label span running from the end of the
    previous pair up to the end of the current generation prompt is set to
    ``IGNORE_INDEX``.

    Returns:
        A dict with ``input_ids`` and ``labels``, each a list with one entry
        per source.
    """
    roles = {"human": "user", "gpt": "assistant"}

    input_ids = []
    targets = []
    for source in sources:
        # Skip the first turn if it is not from human.
        if roles[source[0]["from"]] != "user":
            source = source[1:]

        full_chat = [{'role': roles[turn['from']], 'content': turn['value']} for turn in source]
        rendered = tokenizer.apply_chat_template(full_chat, tokenize=False, add_generation_prompt=False)
        token_ids = tokenizer_multimodal_token(rendered, tokenizer, modal_token, return_tensors='pt')
        labels = copy.deepcopy(token_ids)
        input_ids.append(token_ids)
        targets.append(labels)

        assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."

        masked_upto = 0
        history = []
        # Walk the conversation one (question, answer) pair at a time.
        for question, answer in zip(source[0::2], source[1::2]):
            user_turn = {'role': roles[question['from']], 'content': question['value']}
            reply_turn = {'role': roles[answer['from']], 'content': answer['value']}

            prompt_text = tokenizer.apply_chat_template(
                history + [user_turn], tokenize=False, add_generation_prompt=True)
            dialog_text = tokenizer.apply_chat_template(
                history + [user_turn, reply_turn], tokenize=False, add_generation_prompt=False)

            prompt_len = len(tokenizer_multimodal_token(prompt_text, tokenizer, modal_token, return_tensors='pt'))
            dialog_len = len(tokenizer_multimodal_token(dialog_text, tokenizer, modal_token, return_tensors='pt'))

            labels[masked_upto:prompt_len] = IGNORE_INDEX

            masked_upto = dialog_len
            history += [user_turn, reply_turn]

    return dict(input_ids=input_ids, labels=targets)
211
+
212
+
213
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments,
    modal_token: str = None,
) -> Dict:
    """Move the modal placeholder to the head of every turn that contains it.

    The turns are modified in place. When ``data_args.is_multimodal`` is
    false the input is returned untouched.

    Returns:
        The same ``sources`` object that was passed in.
    """
    if not data_args.is_multimodal:
        return sources

    assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."

    for source in sources:
        for sentence in source:
            text = sentence['value']
            if modal_token in text:
                # Drop every occurrence, then put one placeholder on its own first line.
                text = text.replace(modal_token, '').strip()
                text = (modal_token + '\n' + text).strip()
            replace_token = modal_token
            # TODO: fix this for multimedia, e.g., <video>, <audio>, etc.
            sentence["value"] = text.replace(modal_token, replace_token)

    return sources
235
+
236
+
237
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    The annotation files are parsed up front; media decoding and
    tokenization are done per sample in ``__getitem__``.
    """

    def __init__(self, data_path: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        """Load every annotation file in ``data_path`` into one sample list.

        Args:
            data_path: One path or a sequence of paths to JSON files, each
                holding a list of samples.
            tokenizer: Tokenizer passed on to the preprocess functions.
            data_args: Data options; ``__getitem__`` reads ``data_folder``,
                ``num_frames``, ``image_aspect_ratio`` and ``is_multimodal``
                from it, plus the processors attached at training time.
        """
        super(LazySupervisedDataset, self).__init__()
        # A bare string would otherwise be iterated character by character.
        if isinstance(data_path, str):
            data_path = [data_path]

        list_data_dict = []
        for dp in data_path:
            # Use a context manager so the file handle is closed right away.
            with open(dp, "r") as f:
                list_data_dict.extend(json.load(f))

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        """Whitespace word count per sample, plus 576 for samples with an image."""
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 576 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        """Signed word count per sample: positive for image/video, negative for text-only."""
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            # Video samples are multimodal too; giving them a negative length
            # would group them with the text-only samples.
            is_mm = 'image' in sample or 'video' in sample
            length_list.append(cur_len if is_mm else -cur_len)
        return length_list

    def _fallback_sample(self, kind, path):
        """Print the active exception and return a randomly chosen other sample."""
        traceback.print_exc()
        backup_idx = random.randint(0, len(self.list_data_dict) - 1)
        print(f"Encounted error when reading {kind} {path}, use {backup_idx}-th example instead!!!")
        return self.__getitem__(backup_idx)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the tokenized sample ``i`` with its decoded image or video, if any.

        If the media file cannot be processed, a random other sample is
        returned instead.
        """
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME

        # These attributes are attached to data_args at training time rather than
        # declared on DataArguments, so read them defensively for text-only runs.
        image_processor = getattr(self.data_args, 'image_processor', None)
        video_processor = getattr(self.data_args, 'video_processor', None)

        num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.data_folder
            image_file = os.path.join(image_folder, image_file)

            try:
                image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
            except Exception:
                # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
                return self._fallback_sample("image", image_file)

            # place <image> tag to question head.
            modal_token = "<image>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_folder = self.data_args.data_folder
            video_file = os.path.join(video_folder, video_file)

            try:
                video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames)
            except Exception:
                return self._fallback_sample("video", video_file)

            # place <video> tag to question head.
            modal_token = "<video>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        else:
            modal_token = None
            sources = copy.deepcopy([e["conversations"] for e in sources])

        if getattr(self.data_args, 'is_pretraining', False):
            data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
        else:
            data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)

        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        # image exist in the data
        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict
338
+
339
+
340
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad, truncate and batch the instances, collecting their media tensors.

        Returns:
            A dict with ``input_ids``, ``labels``, ``attention_mask`` and
            ``images``; the last is a list of ``(tensor, modal_name)`` pairs.
        """
        pad_id = self.tokenizer.pad_token_id
        max_len = self.tokenizer.model_max_length

        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]

        # Right-pad to the longest sequence, then clip to the model limit.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=pad_id)[:, :max_len]
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX)[:, :max_len]

        batch = {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": input_ids.ne(pad_id),
        }

        # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
        media = []
        for instance in instances:
            for modal_token in MODAL_INDEX_MAP.keys():
                # MODAL_TOKEN shape like: <image>, <video>, ...
                found = re.findall('[<](.*)[>]', modal_token.lower())
                assert len(found) == 1
                modal_name = found[0]
                if modal_name in instance:
                    media.append((instance[modal_name], modal_name))
        batch['images'] = media

        return batch
377
+
378
+
379
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Returns:
        A dict with ``train_dataset``, ``eval_dataset`` (always None) and
        ``data_collator``.
    """
    dataset = LazySupervisedDataset(
        data_path=data_args.data_path,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
391
+
392
+
393
def train(attn_implementation=None):
    """Parse the CLI arguments, build model and tokenizer, train, and save.

    Args:
        attn_implementation: Value stored on ``config._attn_implementation``
            before the model is loaded.
    """
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Quantization settings are only built for 4-bit / 8-bit runs.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            # device_map={"": training_args.device},
            # BUG: High version transformers report error:
            # ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time
            # load_in_4bit=training_args.bits == 4,
            # load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type, # {'fp4', 'nf4'}
                bnb_4bit_quant_storage=compute_dtype,
            )
        ))

    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)
    config._attn_implementation = attn_implementation

    # With a vision tower the multimodal model class is used; otherwise a plain LLaMA LM.
    if model_args.vision_tower is not None:
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
        if 'mixtral' in model_args.model_type:
            import deepspeed
            deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    else:
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
    # Re-enabled after training, just before saving.
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: mark the embedding output as requiring grad via a forward hook.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)


    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    # NOTE(review): if the tokenizer has no unk_token either, pad_token stays None — confirm.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token

    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        # The dataset reads the attributes below back from data_args.
        data_args.image_size = vision_tower.image_size

        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor

        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        # Projector-only tuning: freeze everything, then unfreeze the projector.
        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        # The dataset uses preprocess_plain when is_pretraining is set.
        if model_args.tune_mm_mlp_adapter:
            data_args.is_pretraining = True
        else:
            data_args.is_pretraining = False

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr
        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames
        # vision_tower is not trainable in VideoLLaMA2
        model.get_model().vision_tower.requires_grad_(False)

    # For quantized runs, cast selected modules to the intended dtypes by name.
    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

    # Resume when the output directory already holds at least one checkpoint-* entry.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        # LoRA weights and the remaining trainable weights are saved separately.
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
571
+
572
+
573
+ if __name__ == "__main__":
574
+ train("flash_attention_2")
videollama2/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import sys
6
+
7
+ import requests
8
+
9
+ from .constants import LOGDIR
10
+
11
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
12
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
13
+
14
+ handler = None
15
+
16
+
17
def build_logger(logger_name, logger_filename):
    """Create a named logger and route stdout/stderr through logging.

    On the first call a daily-rotating file handler under ``LOGDIR`` is
    created and attached to every logger registered at that moment; later
    calls reuse that handler and ignore ``logger_filename``.

    Returns:
        The logger called ``logger_name``, set to INFO level.
    """
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(level=logging.INFO)
    root_logger.handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers (stdout first, then stderr).
    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
        stream_logger = logging.getLogger(stream_name)
        stream_logger.setLevel(level)
        setattr(sys, stream_name, StreamToLogger(stream_logger, level))

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        log_path = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            log_path, when='D', utc=True, encoding='UTF-8')
        handler.setFormatter(formatter)

        # loggerDict may also hold placeholders, hence the isinstance check.
        for registered in logging.root.manager.loggerDict.values():
            if isinstance(registered, logging.Logger):
                registered.addHandler(handler)

    return logger
58
+
59
+
60
class StreamToLogger(object):
    """
    File-like object that forwards complete lines written to it to a logger.
    """
    def __init__(self, logger, log_level=logging.INFO):
        # Remember whatever sys.stdout is right now; unknown attributes are
        # looked up on it.
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        return getattr(self.terminal, attr)

    def write(self, buf):
        """Log every newline-terminated line; keep a trailing partial line buffered."""
        pending = self.linebuf + buf
        self.linebuf = ''
        # keepends=True so a finished line can be told apart from a partial one.
        # sys.stdout.write() expects '\n' newlines, so checking for '\n' here
        # stays cross platform.
        for piece in pending.splitlines(True):
            if piece.endswith('\n'):
                self.logger.log(self.log_level, piece.rstrip())
            else:
                self.linebuf += piece

    def flush(self):
        """Log whatever partial line is buffered and clear the buffer."""
        if self.linebuf:
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''
91
+
92
+
93
def disable_torch_init():
    """
    Replace ``reset_parameters`` on ``torch.nn.Linear`` and ``torch.nn.LayerNorm``
    with a no-op, skipping the default initialization to speed up model creation.
    """
    import torch

    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        setattr(layer_cls, "reset_parameters", lambda self: None)
100
+
101
+
102
def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.

    Args:
        text: The text to check. Newlines are removed before sending.

    Returns:
        The ``flagged`` value of the first result, or False when the request
        fails or the response does not have the expected shape.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    import json

    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    # Serialize with json.dumps so quotes, backslashes and control characters
    # in the text are escaped instead of producing an invalid request body.
    data = json.dumps({"input": text}).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing keys, an empty result list, or a body that is not the
        # expected JSON object are treated as "not flagged".
        flagged = False

    return flagged
121
+
122
+
123
def pretty_print_semaphore(semaphore):
    """Return a short description of ``semaphore``, or ``"None"`` when it is None."""
    if semaphore is not None:
        return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
    return "None"
videollama2/videollama2_trainer.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py
2
+ import os
3
+ import logging
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.data import Sampler
9
+
10
+ from transformers import Trainer
11
+ from transformers.trainer import (
12
+ is_sagemaker_mp_enabled,
13
+ get_parameter_names,
14
+ has_length,
15
+ ALL_LAYERNORM_LAYERS,
16
+ logger,
17
+ TRAINER_STATE_NAME,
18
+ )
19
+
20
+
21
def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU copy of ``param``, gathering it first if DeepSpeed manages it.

    Args:
        param: The parameter to copy. It is treated as DeepSpeed-managed
            when it has a ``ds_id`` attribute.
        ignore_status: When False, a warning is logged if the parameter's
            ``ds_status`` is ``NOT_AVAILABLE``.
        name: Parameter name used in the warning message.

    Returns:
        A detached clone of the parameter data on the CPU.
    """
    if not hasattr(param, "ds_id"):
        # Plain parameter: DeepSpeed is not needed, so it is not imported.
        return param.detach().cpu().clone()

    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
    # Gather the parameter before copying it out.
    with zero.GatheredParameters([param]):
        param = param.data.detach().cpu().clone()
    return param
33
+
34
+
35
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect CPU copies of the parameters whose name contains any of ``keys_to_match``.

    Returns:
        A dict mapping each matching parameter name to its copied tensor.
    """
    matched = {}
    for param_name, param in named_params:
        if any(key in param_name for key in keys_to_match):
            matched[param_name] = param
    return {
        param_name: maybe_zero_3(param, ignore_status=True, name=param_name).cpu()
        for param_name, param in matched.items()
    }
39
+
40
+
41
+ # Borrowed from peft.utils.get_peft_model_state_dict
42
def get_peft_state_maybe_zero_3(named_params, bias):
    """Collect CPU copies of the LoRA parameters, and bias terms as selected by ``bias``.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        bias: ``"none"`` keeps only names containing ``"lora_"``; ``"all"``
            also keeps every name containing ``"bias"``; ``"lora_only"``
            keeps LoRA parameters plus the bias terms that sit next to a
            LoRA parameter.

    Returns:
        A dict mapping each selected parameter name to its copied tensor.

    Raises:
        NotImplementedError: For any other ``bias`` value.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # Name of the bias next to this LoRA parameter.
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Iterate (name, tensor) pairs and test each candidate's own name;
        # iterating the dict directly would unpack the key string.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return
65
+
66
+
67
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
    """Collect CPU copies of the parameters whose name does not contain ``"lora_"``.

    With ``require_grad_only`` set, parameters that do not require grad are
    left out as well.
    """
    kept = {}
    for param_name, param in named_params:
        if "lora_" in param_name:
            continue
        kept[param_name] = param
    if require_grad_only:
        kept = {param_name: param for param_name, param in kept.items() if param.requires_grad}
    return {
        param_name: maybe_zero_3(param, ignore_status=True).cpu()
        for param_name, param in kept.items()
    }
73
+
74
+
75
def find_all_linear_names(model):
    """Return the leaf names of all ``torch.nn.Linear`` modules outside the multimodal parts.

    Modules whose qualified name contains ``mm_projector``, ``vision_tower``
    or ``vision_resampler`` are skipped, and ``lm_head`` is never returned.
    """
    multimodal_keywords = ('mm_projector', 'vision_tower', 'vision_resampler')
    leaf_names = set()
    for qualified_name, module in model.named_modules():
        if any(keyword in qualified_name for keyword in multimodal_keywords):
            continue
        if not isinstance(module, torch.nn.Linear):
            continue
        # Keep only the last dotted component of the module path.
        leaf_names.add(qualified_name.rsplit('.', 1)[-1])

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)
89
+
90
+
91
def safe_save_model_for_hf_trainer(trainer: Trainer,
                                   output_dir: str):
    """Collects the state dict and dump to disk.

    When ``trainer.args.tune_mm_mlp_adapter`` is set, only the
    ``mm_projector`` weights and the model config are written. Otherwise the
    model is saved through the trainer, or as a CPU state dict when DeepSpeed
    is not in use.
    """
    args = trainer.args

    if getattr(args, "tune_mm_mlp_adapter", False):
        # Only save Adapter
        adapter_weights = get_mm_adapter_state_maybe_zero_3(
            trainer.model.named_parameters(), ['mm_projector'])
        trainer.model.config.save_pretrained(output_dir)

        current_folder = output_dir.split('/')[-1]
        parent_folder = os.path.dirname(output_dir)
        is_main_process = args.local_rank == 0 or args.local_rank == -1
        if is_main_process:
            if current_folder.startswith('checkpoint-'):
                # Checkpoint weights go into a shared mm_projector folder beside it.
                mm_projector_folder = os.path.join(parent_folder, "mm_projector")
                os.makedirs(mm_projector_folder, exist_ok=True)
                target = os.path.join(mm_projector_folder, f'{current_folder}.bin')
            else:
                target = os.path.join(output_dir, 'mm_projector.bin')
            torch.save(adapter_weights, target)
        return

    if trainer.deepspeed:
        torch.cuda.synchronize()
        trainer.save_model(output_dir)
        return

    state_dict = trainer.model.state_dict()
    if args.should_save:
        cpu_state_dict = {key: tensor.cpu() for key, tensor in state_dict.items()}
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
126
+
127
+
128
def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `num_chunks` chunks of roughly equal total length.

    When the indices do not divide evenly, they are dealt out round-robin.
    Otherwise each index goes to the chunk with the smallest running length
    that is not yet full.
    """
    if len(indices) % num_chunks:
        return [indices[start::num_chunks] for start in range(num_chunks)]

    capacity = len(indices) // num_chunks

    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks
    for idx in indices:
        # First bucket holding the smallest accumulated length.
        lightest = min(range(num_chunks), key=loads.__getitem__)
        buckets[lightest].append(idx)
        loads[lightest] += lengths[idx]
        if len(buckets[lightest]) == capacity:
            # A full bucket must never be picked again.
            loads[lightest] = float("inf")

    return buckets
148
+
149
+
150
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """Return sample indices grouped by length within each modality.

    Positive lengths mark multimodal samples and negative lengths mark
    language-only samples. Full megabatches of both kinds are shuffled
    together; the last megabatch of each kind is merged, sorted and appended
    at the end.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    single_modality = all(l > 0 for l in lengths) or all(l < 0 for l in lengths)
    if single_modality:
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)

    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

    mm_order = get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)
    mm_shuffle = [mm_indices[pos] for pos in mm_order]
    lang_order = get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)
    lang_shuffle = [lang_indices[pos] for pos in lang_order]

    megabatch_size = world_size * batch_size
    mm_megabatches = [
        mm_shuffle[start : start + megabatch_size]
        for start in range(0, len(mm_shuffle), megabatch_size)
    ]
    lang_megabatches = [
        lang_shuffle[start : start + megabatch_size]
        for start in range(0, len(lang_shuffle), megabatch_size)
    ]

    # The final megabatch of each modality is held back and merged.
    additional_batch = mm_megabatches[-1] + lang_megabatches[-1]
    full_megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    shuffled_order = torch.randperm(len(full_megabatches), generator=generator)
    megabatches = [full_megabatches[pos] for pos in shuffled_order]

    if additional_batch:
        megabatches.append(sorted(additional_batch))

    return [index for megabatch in megabatches for index in megabatch]
176
+
177
+
178
def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Return a random permutation of indices, grouped by length inside each megabatch.

    The permutation is cut into megabatches of ``world_size * batch_size``
    indices. Each megabatch is sorted by descending length and split into
    ``world_size`` chunks, and the chunks are flattened in order. ``merge``
    is accepted but not used.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size

    flat = []
    for start in range(0, len(lengths), megabatch_size):
        megabatch = permutation[start : start + megabatch_size].tolist()
        megabatch = sorted(megabatch, key=lambda i: lengths[i], reverse=True)
        for chunk in split_to_even_chunks(megabatch, lengths, world_size):
            flat.extend(chunk)
    return flat
187
+
188
+
189
class LengthGroupedSampler(Sampler):
    r"""
    Sampler that yields indices so that samples of roughly the same length end up together, while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size = batch_size
        self.world_size = world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        # Pick the grouping routine, then call it with the same arguments.
        if self.group_by_modality:
            grouping_fn = get_modality_length_grouped_indices
        else:
            grouping_fn = get_length_grouped_indices
        indices = grouping_fn(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(indices)
221
+
222
+
223
+ class VideoLLaMA2Trainer(Trainer):
224
+
225
def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
    """Pick the sampler used for the training dataloader.

    Returns:
        ``None`` when there is no training dataset or it has no length; a
        modality-aware ``LengthGroupedSampler`` when
        ``args.group_by_modality_length`` is set; otherwise whatever the
        parent ``Trainer`` returns.
    """
    dataset = self.train_dataset
    if dataset is None or not has_length(dataset):
        return None

    if not self.args.group_by_modality_length:
        return super()._get_train_sampler()

    modality_lengths = self.train_dataset.modality_lengths
    # The sampler's "world size" is the process count multiplied by the
    # gradient-accumulation steps.
    effective_world_size = self.args.world_size * self.args.gradient_accumulation_steps
    return LengthGroupedSampler(
        self.args.train_batch_size,
        world_size=effective_world_size,
        lengths=modality_lengths,
        group_by_modality=True,
    )
239
+
240
def create_optimizer(self):
    """
    Setup the optimizer.

    We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
    Trainer's init through `optimizers`, or subclass and override this method in a subclass.

    Trainable parameters are split into weight-decay and no-decay groups (names containing ``"bias"`` and
    parameters excluded by ``get_parameter_names(..., ALL_LAYERNORM_LAYERS)`` get ``weight_decay=0.0``). When
    ``args.mm_projector_lr`` is not ``None``, parameters whose name contains ``"mm_projector"`` are placed in two
    additional groups that use that learning rate.

    Returns:
        The optimizer stored on ``self.optimizer`` (created here if it was ``None``), or the parent's result
        when SageMaker model parallelism is enabled.
    """
    if is_sagemaker_mp_enabled():
        return super().create_optimizer()

    opt_model = self.model

    if self.optimizer is None:
        # A set gives O(1) membership tests while bucketing the parameters below.
        decay_parameters = {
            name
            for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
            if "bias" not in name
        }
        projector_lr = self.args.mm_projector_lr
        split_projector = projector_lr is not None

        # Buckets are keyed by (uses_projector_lr, applies_weight_decay). A single pass over
        # `named_parameters()` keeps the parameter order inside every group unchanged.
        buckets = {
            (False, True): [],
            (False, False): [],
            (True, True): [],
            (True, False): [],
        }
        for name, param in opt_model.named_parameters():
            if not param.requires_grad:
                continue
            in_projector = split_projector and "mm_projector" in name
            buckets[(in_projector, name in decay_parameters)].append(param)

        optimizer_grouped_parameters = [
            {
                "params": buckets[(False, True)],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": buckets[(False, False)],
                "weight_decay": 0.0,
            },
        ]
        if split_projector:
            # The projector groups are emitted even when empty, after the two base groups.
            optimizer_grouped_parameters += [
                {
                    "params": buckets[(True, True)],
                    "weight_decay": self.args.weight_decay,
                    "lr": projector_lr,
                },
                {
                    "params": buckets[(True, False)],
                    "weight_decay": 0.0,
                    "lr": projector_lr,
                },
            ]

        optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

        self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
        if optimizer_cls.__name__ == "Adam8bit":
            import bitsandbytes

            manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

            skipped = 0
            for module in opt_model.modules():
                if isinstance(module, nn.Embedding):
                    # Keying by data_ptr counts tensors that share storage only once.
                    skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                    logger.info(f"skipped {module}: {skipped/2**20}M params")
                    # Embedding weights get a 32-bit optimizer-state override.
                    manager.register_module_override(module, "weight", {"optim_bits": 32})
                    logger.debug(f"bitsandbytes: will optimize {module} in fp32")
            logger.info(f"skipped: {skipped/2**20}M params")

    return self.optimizer
319
+
320
def _save_checkpoint(self, model, trial, metrics=None):
    """Write a training checkpoint, with special cases for adapter-only and LoRA runs.

    * ``args.tune_mm_mlp_adapter`` truthy: only the parameters matched by
      ``mm_projector`` / ``vision_resampler`` are written (``mm_projector.bin``),
      together with the model config, optimizer/scheduler, RNG state and
      trainer state. The parent implementation is not called.
    * ``args.lora_enable`` truthy: the model config and
      ``non_lora_trainables.bin`` are written first, then the parent
      ``_save_checkpoint`` runs.
    * otherwise: the parent ``_save_checkpoint`` handles everything.
    """
    adapter_only = getattr(self.args, 'tune_mm_mlp_adapter', False)

    if not adapter_only and not self.args.lora_enable:
        super(VideoLLaMA2Trainer, self)._save_checkpoint(model, trial, metrics)
        return

    from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

    folder_name = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
    base_dir = self._get_output_dir(trial=trial)
    output_dir = os.path.join(base_dir, folder_name)

    if adapter_only:
        # Only the adapter weights are persisted in this mode.
        adapter_keys = ['mm_projector', 'vision_resampler']
        adapter_weights = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), adapter_keys)

        if self.args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            torch.save(adapter_weights, os.path.join(output_dir, 'mm_projector.bin'))

        # NOTE(review): the calls below are taken to run on every rank, not only
        # rank 0, since they end in `wait_for_everyone()` - confirm against the
        # original indentation.
        self._save_optimizer_and_scheduler(output_dir)
        self._save_rng_state(output_dir)
        self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
        self.args.distributed_state.wait_for_everyone()
        return

    # LoRA: allow saving a complete LoRA checkpoint during training.
    # `state_dict` is collected but not written here; it was only consumed by a
    # `self.model.save_pretrained(output_dir, state_dict=state_dict)` call that is disabled.
    state_dict = get_peft_state_maybe_zero_3(self.model.named_parameters(), self.args.lora_bias)
    non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(self.model.named_parameters())
    if self.args.local_rank in (0, -1):
        # Produces `config.json` in the checkpoint folder.
        self.model.config.save_pretrained(output_dir)
        torch.save(non_lora_state_dict, os.path.join(output_dir, 'non_lora_trainables.bin'))

    # The parent writes the LoRA adapter parameters and trainer states
    # (`adapter_config.json`, `adapter_model.safetensors`).
    super(VideoLLaMA2Trainer, self)._save_checkpoint(model, trial, metrics)
364
+
365
def _save(self, output_dir: Optional[str] = None, state_dict=None):
    """Save the model through the parent ``Trainer`` unless only the adapter is tuned.

    When ``args.tune_mm_mlp_adapter`` is truthy this method writes nothing;
    otherwise it forwards ``output_dir`` and ``state_dict`` to the parent
    ``_save``.
    """
    if not getattr(self.args, 'tune_mm_mlp_adapter', False):
        super(VideoLLaMA2Trainer, self)._save(output_dir, state_dict)