Update README.md
README.md CHANGED
@@ -201,6 +201,114 @@ for t in range(31):
Here is a code snippet showing how to do common video QA with `transformers` and the utilities above:

```python
import functools, torch
from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl() # important: the model was trained with this kernel, so keep it applied for consistency
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
from qwen_vl_utils import process_vision_info

class LiveCCDemoInfer:
    fps = 2
    initial_fps_frames = 6
    streaming_fps_frames = 2
    initial_time_interval = initial_fps_frames / fps
    streaming_time_interval = streaming_fps_frames / fps
    frame_time_interval = 1 / fps

    def __init__(self, model_path: str = None, device: str = 'cuda'):
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path, torch_dtype="auto",
            device_map=device,
            attn_implementation='sdpa'
        )
        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
        # token id used as the streaming EOS marker (the last token of ' ...')
        self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
        # patch generation so cached multi-turn multimodal inputs are handled correctly
        self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": 'livecc'},
            ]
        }
        texts = self.processor.apply_chat_template([message], tokenize=False)
        # offset where the user turn starts, used to strip the system prompt on follow-up turns
        self.system_prompt_offset = texts.index('<|im_start|>user')
        self._cached_video_readers_with_hw = {}

    @torch.inference_mode()
    def video_qa(
        self,
        message: str,
        state: dict,
        history: list = [],
        do_sample: bool = False,
        repetition_penalty: float = 1.05,
        hf_spaces: bool = False,
        **kwargs,
    ):
        """
        state: dict that may contain the keys:
            video_path: str, video path
            video_timestamp: float, current video timestamp
            last_timestamp: float, last processed video timestamp
            last_video_pts_index: int, last processed video frame index
            video_pts: np.ndarray, video frame timestamps (pts)
            last_history: list, last processed history
        """
        video_path = state.get('video_path', None)
        conversation = []
        if hf_spaces:
            for past_message in history:
                content = [{"type": "text", "text": past_message['content']}]
                if video_path: # only use once
                    content.insert(0, {"type": "video", "video": video_path})
                    video_path = None
                conversation.append({"role": past_message["role"], "content": content})
        else:
            pass # use past_key_values
        past_ids = state.get('past_ids', None)
        content = [{"type": "text", "text": message}]
        if past_ids is None and video_path: # only use once
            content.insert(0, {"type": "video", "video": video_path})
        conversation.append({"role": "user", "content": content})
        print(conversation)
        image_inputs, video_inputs = process_vision_info(conversation)
        texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
        if past_ids is not None:
            texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
        inputs = self.processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            return_attention_mask=False
        )
        inputs.to(self.model.device)
        if past_ids is not None:
            inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
        outputs = self.model.generate(
            **inputs, past_key_values=state.get('past_key_values', None),
            return_dict_in_generate=True, do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            max_new_tokens=512,
        )
        state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
        state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
        response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
        return response, state

model_path = 'chenjoya/LiveCC-7B-Instruct'
video_path = "spacex_falcon9.mp4"

infer = LiveCCDemoInfer(model_path=model_path)
state = {'video_path': video_path}
# first round
response, state = infer.video_qa(message='What is the video?', state=state)
# second round
response, state = infer.video_qa(message='What? Say again.', state=state)
```
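
Each call to `video_qa` returns an updated `state` that caches `past_key_values` and `past_ids` (when `hf_spaces=False`), so follow-up questions reuse the already-processed context instead of re-encoding the video. To ask about a different video, you can simply start from a fresh state. A minimal sketch continuing from the snippet above (the file name `another_video.mp4` is a placeholder, not part of the original example):

```python
# Continuing from the snippet above: reset the state dict to start a new
# conversation about a different video. The path below is only a placeholder.
fresh_state = {'video_path': 'another_video.mp4'}
answer, fresh_state = infer.video_qa(message='Describe what happens in this clip.', state=fresh_state)
print(answer)

# Follow-up questions reuse the cached context stored in fresh_state
# (past_key_values / past_ids), so the video is not re-encoded.
answer, fresh_state = infer.video_qa(message='What happens right after that?', state=fresh_state)
print(answer)
```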

## Limitations