hexuan21 committed · Commit 752bf91 · verified · Parent: 2fe9a47

Update README.md

Files changed (1): README.md (+16 −4)
README.md CHANGED
@@ -78,6 +78,11 @@ pip install git+https://github.com/TIGER-AI-Lab/MantisScore.git
 ```python
 import av
 import numpy as np
+from typing import List
+import torch
+from transformers import AutoProcessor
+from models.idefics2 import Idefics2ForSequenceClassification
+
 def _read_video_pyav(
     frame_paths:List[str],
     max_frames:int,
@@ -94,6 +99,7 @@ def _read_video_pyav(
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

 MAX_NUM_FRAMES=16
+ROUND_DIGIT=3
 REGRESSION_QUERY_PROMPT = """
 Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
 please watch the following frames of a given video and see the text prompt for generating the video,
@@ -118,7 +124,14 @@ For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """

+model_name="TIGER-Lab/MantisScore-anno-only"
 video_path="examples/video1.mp4"
+video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."
+
+processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
+model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)

 # sample uniformly 8 frames from the video
 container = av.open(video_path)
@@ -129,7 +142,7 @@ else:
     indices = np.arange(total_frames)

 frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
-eval_prompt = REGRESSION_QUERY_TEMPLATE.format(text_prompt=video_prompt)
+eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
 num_image_token = eval_prompt.count("<image>")
 if num_image_token < len(frames):
     eval_prompt += "<image> " * (len(frames) - num_image_token)
@@ -153,12 +166,11 @@ num_aspects = logits.shape[-1]
 aspect_scores = []
 for i in range(num_aspects):
     aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
-print(aspect_scores)
-
+print(aspect_scores)
 """
 # model output on visual quality, temporal consistency, dynamic degree,
 # text-to-video alignment, factual consistency, respectively
-[2.2969, 2.4375, 2.8281, 2.5, 2.4688]
+[2.453, 2.706, 2.468, 2.464, 2.572]
 """

 ```
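
For orientation, here is a minimal sketch of how the updated README example reads once this commit is applied. Everything between the hunks (the body of `_read_video_pyav`, the frame-index selection, the `from PIL import Image` import, and the processor/forward-pass step) is not shown in the diff, so those parts are filled in below as assumptions based on common PyAV and Idefics2 usage rather than taken from the README. Note also that the diff's `_read_video_pyav` signature (`frame_paths`, `max_frames`) does not match its call site (`container`, `indices`); the sketch follows the call site.

```python
# Sketch of the post-commit README example (abridged).
# Parts not shown in the diff are assumptions and may differ from the actual README.
import av
import numpy as np
import torch
from PIL import Image                      # assumed import (Image.fromarray is used in the diff)
from transformers import AutoProcessor
from models.idefics2 import Idefics2ForSequenceClassification  # shipped with the MantisScore repo

MAX_NUM_FRAMES = 16
ROUND_DIGIT = 3
# Abridged prompt: only the lines visible in the diff; the full text lives in the README.
REGRESSION_QUERY_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos, ...
For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:
"""

def _read_video_pyav(container, indices):
    # Assumed body: decode only the selected frame indices and stack them as RGB arrays.
    # The diff shows just the signature and the return line.
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

model_name = "TIGER-Lab/MantisScore-anno-only"
video_path = "examples/video1.mp4"
video_prompt = "Near the Elephant Gate village, they approach the haunted house at night. ..."

processor = AutoProcessor.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = Idefics2ForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample up to MAX_NUM_FRAMES frames uniformly from the video
# (the index-selection step is elided in the diff; this is an assumption).
container = av.open(video_path)
total_frames = container.streams.video[0].frames
if total_frames > MAX_NUM_FRAMES:
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)
frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]

# Build the prompt and pad it with one "<image>" placeholder per frame.
eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
num_image_token = eval_prompt.count("<image>")
if num_image_token < len(frames):
    eval_prompt += "<image> " * (len(frames) - num_image_token)

# Assumed inference step: standard Idefics2 processor call plus the repo's
# sequence-classification head, which exposes one regression logit per aspect.
inputs = processor(text=eval_prompt, images=frames, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits

aspect_scores = [round(logits[0, i].item(), ROUND_DIGIT) for i in range(logits.shape[-1])]
print(aspect_scores)  # visual quality, temporal consistency, dynamic degree,
                      # text-to-video alignment, factual consistency
```

The commit's net effect is to make the snippet self-contained: it adds the missing imports and the `ROUND_DIGIT` constant, loads the processor and the `TIGER-Lab/MantisScore-anno-only` checkpoint explicitly, fixes the `REGRESSION_QUERY_TEMPLATE` → `REGRESSION_QUERY_PROMPT` name mismatch, and updates the example output scores.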