Commit 42827e2 (verified) by chenjoya · 1 parent: 6e0d506

Update README.md

Files changed (1): README.md (+108 −0)
README.md CHANGED
@@ -201,6 +201,114 @@ for t in range(31):
  t += 1
  ```
 
+ Here is a code snippet showing how to do common video QA with `transformers` and the utilities above:
+ ```python
+ import functools, torch
+ from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
+ apply_liger_kernel_to_qwen2_vl() # important: the model is trained with this kernel, so keep it for consistency
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
+ from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
+ from qwen_vl_utils import process_vision_info
+ 
+ class LiveCCDemoInfer:
+     fps = 2
+     initial_fps_frames = 6
+     streaming_fps_frames = 2
+     initial_time_interval = initial_fps_frames / fps
+     streaming_time_interval = streaming_fps_frames / fps
+     frame_time_interval = 1 / fps
+ 
+     def __init__(self, model_path: str = None, device: str = 'cuda'):
+         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+             model_path, torch_dtype="auto",
+             device_map=device,
+             attn_implementation='sdpa'
+         )
+         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
+         self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
+         # override prepare_inputs_for_generation so multi-turn multimodal inputs are handled correctly
+         self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
+         message = {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": 'livecc'},
+             ]
+         }
+         texts = self.processor.apply_chat_template([message], tokenize=False)
+         self.system_prompt_offset = texts.index('<|im_start|>user')
+         self._cached_video_readers_with_hw = {}
+ 
+     @torch.inference_mode()
+     def video_qa(
+         self,
+         message: str,
+         state: dict,
+         history: list = [],
+         do_sample: bool = False,
+         repetition_penalty: float = 1.05,
+         hf_spaces: bool = False,
+         **kwargs,
+     ):
+         """
+         state: dict, (maybe) with keys:
+             video_path: str, video path
+             video_timestamp: float, current video timestamp
+             last_timestamp: float, last processed video timestamp
+             last_video_pts_index: int, last processed video frame index
+             video_pts: np.ndarray, video pts
+             last_history: list, last processed history
+         """
+         video_path = state.get('video_path', None)
+         conversation = []
+         if hf_spaces:
+             for past_message in history:
+                 content = [{"type": "text", "text": past_message['content']}]
+                 if video_path: # only use once
+                     content.insert(0, {"type": "video", "video": video_path})
+                     video_path = None
+                 conversation.append({"role": past_message["role"], "content": content})
+         else:
+             pass # use past_key_values
+         past_ids = state.get('past_ids', None)
+         content = [{"type": "text", "text": message}]
+         if past_ids is None and video_path: # only use once
+             content.insert(0, {"type": "video", "video": video_path})
+         conversation.append({"role": "user", "content": content})
+         print(conversation)
+         image_inputs, video_inputs = process_vision_info(conversation)
+         texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
+         if past_ids is not None:
+             # drop the system prompt; earlier turns are carried in past_ids / past_key_values
+             texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
+         inputs = self.processor(
+             text=texts,
+             images=image_inputs,
+             videos=video_inputs,
+             return_tensors="pt",
+             return_attention_mask=False
+         )
+         inputs.to(self.model.device)
+         if past_ids is not None:
+             inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
+         outputs = self.model.generate(
+             **inputs, past_key_values=state.get('past_key_values', None),
+             return_dict_in_generate=True, do_sample=do_sample,
+             repetition_penalty=repetition_penalty,
+             max_new_tokens=512,
+         )
+         state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
+         state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
+         response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
+         return response, state
+ 
+ model_path = 'chenjoya/LiveCC-7B-Instruct'
+ video_path = "spacex_falcon9.mp4"
+ 
+ infer = LiveCCDemoInfer(model_path=model_path)
+ state = {'video_path': video_path}
+ # first round
+ response, state = infer.video_qa(message='What is the video?', state=state)
+ # second round
+ response, state = infer.video_qa(message='What? Say again.', state=state)
+ ```
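+ 
+ To start a fresh conversation on another video, pass a new `state` dict (a minimal sketch; `another_video.mp4` and the question are placeholder values):
+ ```python
+ # a fresh state carries no past_ids / past_key_values, so the new video is encoded from scratch
+ state = {'video_path': 'another_video.mp4'}
+ response, state = infer.video_qa(message='Summarize this clip.', state=state)
+ ```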
 
  ## Limitations