hexuan21 committed · Commit 752bf91 · verified · Parent: 2fe9a47

Update README.md

Files changed (1): README.md (+16 −4)
README.md CHANGED
@@ -78,6 +78,11 @@ pip install git+https://github.com/TIGER-AI-Lab/MantisScore.git
 ```python
 import av
 import numpy as np
+from typing import List
+import torch
+from transformers import AutoProcessor
+from models.idefics2 import Idefics2ForSequenceClassification
+
 def _read_video_pyav(
     frame_paths:List[str],
     max_frames:int,
@@ -94,6 +99,7 @@ def _read_video_pyav(
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

 MAX_NUM_FRAMES=16
+ROUND_DIGIT=3
 REGRESSION_QUERY_PROMPT = """
 Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
 please watch the following frames of a given video and see the text prompt for generating the video,
@@ -118,7 +124,14 @@ For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """

+model_name="TIGER-Lab/MantisScore-anno-only"
 video_path="examples/video1.mp4"
+video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."
+
+processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
+model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)

 # sample uniformly 8 frames from the video
 container = av.open(video_path)
@@ -129,7 +142,7 @@ else:
     indices = np.arange(total_frames)

 frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
-eval_prompt = REGRESSION_QUERY_TEMPLATE.format(text_prompt=video_prompt)
+eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
 num_image_token = eval_prompt.count("<image>")
 if num_image_token < len(frames):
     eval_prompt += "<image> " * (len(frames) - num_image_token)
@@ -153,12 +166,11 @@ num_aspects = logits.shape[-1]
 aspect_scores = []
 for i in range(num_aspects):
     aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
-print(aspect_scores)
-
+print(aspect_scores)
 """
 # model output on visual quality, temporal consistency, dynamic degree,
 # text-to-video alignment, factual consistency, respectively
-[2.2969, 2.4375, 2.8281, 2.5, 2.4688]
+[2.453, 2.706, 2.468, 2.464, 2.572]
 """

 ```
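
For orientation, here is a minimal sketch of how the updated README example reads once this commit is applied. Everything between the hunks (the body of `_read_video_pyav`, the frame-index selection, the `from PIL import Image` import, and the processor/forward-pass step) is not shown in the diff, so those parts are filled in below as assumptions based on common PyAV and Idefics2 usage rather than taken from the README. Note also that the diff's `_read_video_pyav` signature (`frame_paths`, `max_frames`) does not match its call site (`container`, `indices`); the sketch follows the call site.

```python
# Sketch of the post-commit README example (abridged).
# Parts not shown in the diff are assumptions and may differ from the actual README.
import av
import numpy as np
import torch
from PIL import Image                      # assumed import (Image.fromarray is used in the diff)
from transformers import AutoProcessor
from models.idefics2 import Idefics2ForSequenceClassification  # shipped with the MantisScore repo

MAX_NUM_FRAMES = 16
ROUND_DIGIT = 3
# Abridged prompt: only the lines visible in the diff; the full text lives in the README.
REGRESSION_QUERY_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos, ...
For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:
"""

def _read_video_pyav(container, indices):
    # Assumed body: decode only the selected frame indices and stack them as RGB arrays.
    # The diff shows just the signature and the return line.
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

model_name = "TIGER-Lab/MantisScore-anno-only"
video_path = "examples/video1.mp4"
video_prompt = "Near the Elephant Gate village, they approach the haunted house at night. ..."

processor = AutoProcessor.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = Idefics2ForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample up to MAX_NUM_FRAMES frames uniformly from the video
# (the index-selection step is elided in the diff; this is an assumption).
container = av.open(video_path)
total_frames = container.streams.video[0].frames
if total_frames > MAX_NUM_FRAMES:
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)
frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]

# Build the prompt and pad it with one "<image>" placeholder per frame.
eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
num_image_token = eval_prompt.count("<image>")
if num_image_token < len(frames):
    eval_prompt += "<image> " * (len(frames) - num_image_token)

# Assumed inference step: standard Idefics2 processor call plus the repo's
# sequence-classification head, which exposes one regression logit per aspect.
inputs = processor(text=eval_prompt, images=frames, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits

aspect_scores = [round(logits[0, i].item(), ROUND_DIGIT) for i in range(logits.shape[-1])]
print(aspect_scores)  # visual quality, temporal consistency, dynamic degree,
                      # text-to-video alignment, factual consistency
```

The commit's net effect is to make the snippet self-contained: it adds the missing imports and the `ROUND_DIGIT` constant, loads the processor and the `TIGER-Lab/MantisScore-anno-only` checkpoint explicitly, fixes the `REGRESSION_QUERY_TEMPLATE` → `REGRESSION_QUERY_PROMPT` name mismatch, and updates the example output scores.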