Safetensors
English
qwen2_vl
qwen_vl
video
real-time
multimodal
LLM
chenjoya committed (verified)
Commit b0ff228 · 1 Parent(s): f0054fc

Update README.md

Files changed (1)
  1. README.md +11 -19
README.md CHANGED
@@ -226,7 +226,7 @@ class LiveCCDemoInfer:
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_path, torch_dtype="auto",
             device_map=device,
-            attn_implementation='sdpa'
+            attn_implementation='flash_attention_2'
         )
         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
         self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
@@ -239,17 +239,13 @@ class LiveCCDemoInfer:
         }
         texts = self.processor.apply_chat_template([message], tokenize=False)
         self.system_prompt_offset = texts.index('<|im_start|>user')
-        self._cached_video_readers_with_hw = {}
 
-    @torch.inference_mode()
     def video_qa(
         self,
         message: str,
         state: dict,
-        history: list = [],
         do_sample: bool = False,
         repetition_penalty: float = 1.05,
-        hf_spaces: bool = False,
         **kwargs,
     ):
         """
@@ -263,15 +259,6 @@ class LiveCCDemoInfer:
         """
         video_path = state.get('video_path', None)
         conversation = []
-        if hf_spaces:
-            for past_message in history:
-                content = [{"type": "text", "text": past_message['content']}]
-                if video_path: # only use once
-                    content.insert(0, {"type": "video", "video": video_path})
-                    video_path = None
-                conversation.append({"role": past_message["role"], "content": content})
-        else:
-            pass # use past_key_values
         past_ids = state.get('past_ids', None)
         content = [{"type": "text", "text": message}]
         if past_ids is None and video_path: # only use once
@@ -297,20 +284,25 @@ class LiveCCDemoInfer:
             repetition_penalty=repetition_penalty,
             max_new_tokens=512,
         )
-        state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
-        state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
+        state['past_key_values'] = outputs.past_key_values
+        state['past_ids'] = outputs.sequences[:, :-1]
         response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
         return response, state
 
 model_path = 'chenjoya/LiveCC-7B-Instruct'
-video_path = "spacex_falcon9.mp4"
+# download a test video at: https://github.com/showlab/livecc/blob/main/demo/sources/howto_fix_laptop_mute_1080p.mp4
+video_path = "demo/sources/howto_fix_laptop_mute_1080p.mp4"
 
 infer = LiveCCDemoInfer(model_path=model_path)
 state = {'video_path': video_path}
 # first round
-response, state = infer.video_qa(message='What is the video?', state=state)
+query1 = 'What is the video?'
+response1, state = infer.video_qa(message=query1, state=state)
+print(f'Q1: {query1}\nA1: {response1}')
 # second round
-response, state = infer.video_qa(message='What? Say again.', state=state)
+query2 = 'How do you know that?'
+response2, state = infer.video_qa(message=query2, state=state)
+print(f'Q2: {query2}\nA2: {response2}')
 ```
 
 ## Limitations
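Among other cleanups, the commit switches `attn_implementation` from `'sdpa'` to `'flash_attention_2'`, which only works when the `flash-attn` package is installed and a supported CUDA GPU is available. Below is a minimal sketch, not part of the commit, of how a reader could keep the snippet runnable without flash-attn; `model_path` mirrors the example above, and `device_map='cuda'` stands in for the `device` argument used in the class.

```python
# Hedged sketch, not from the commit: use flash_attention_2 only if the
# flash-attn package is importable, otherwise fall back to PyTorch SDPA.
import importlib.util
from transformers import Qwen2VLForConditionalGeneration

model_path = 'chenjoya/LiveCC-7B-Instruct'
attn_impl = 'flash_attention_2' if importlib.util.find_spec('flash_attn') else 'sdpa'

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map='cuda',              # the class above passes its `device` argument here
    attn_implementation=attn_impl,  # falls back to 'sdpa' when flash-attn is absent
)
```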