Update README.md
Browse files
README.md
CHANGED
@@ -78,6 +78,11 @@ pip install git+https://github.com/TIGER-AI-Lab/MantisScore.git
|
|
78 |
```python
|
79 |
import av
|
80 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
81 |
def _read_video_pyav(
|
82 |
frame_paths:List[str],
|
83 |
max_frames:int,
|
@@ -94,6 +99,7 @@ def _read_video_pyav(
|
|
94 |
return np.stack([x.to_ndarray(format="rgb24") for x in frames])
|
95 |
|
96 |
MAX_NUM_FRAMES=16
|
|
|
97 |
REGRESSION_QUERY_PROMPT = """
|
98 |
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
|
99 |
please watch the following frames of a given video and see the text prompt for generating the video,
|
@@ -118,7 +124,14 @@ For this video, the text prompt is "{text_prompt}",
|
|
118 |
all the frames of video are as follows:
|
119 |
"""
|
120 |
|
|
|
121 |
video_path="examples/video1.mp4"
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
# sample uniformly up to MAX_NUM_FRAMES frames from the video
|
124 |
container = av.open(video_path)
|
@@ -129,7 +142,7 @@ else:
|
|
129 |
indices = np.arange(total_frames)
|
130 |
|
131 |
frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
|
132 |
-
eval_prompt =
|
133 |
num_image_token = eval_prompt.count("<image>")
|
134 |
if num_image_token < len(frames):
|
135 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|
@@ -153,12 +166,11 @@ num_aspects = logits.shape[-1]
|
|
153 |
aspect_scores = []
|
154 |
for i in range(num_aspects):
|
155 |
aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
|
156 |
-
print(aspect_scores)
|
157 |
-
|
158 |
"""
|
159 |
# model output on visual quality, temporal consistency, dynamic degree,
|
160 |
# text-to-video alignment, factual consistency, respectively
|
161 |
-
[2.
|
162 |
"""
|
163 |
|
164 |
```
|
|
|
78 |
```python
|
79 |
import av
|
80 |
import numpy as np
|
81 |
+
from typing import List
|
82 |
+
import torch
|
83 |
+
from transformers import AutoProcessor
|
84 |
+
from models.idefics2 import Idefics2ForSequenceClassification
|
85 |
+
|
86 |
def _read_video_pyav(
|
87 |
frame_paths:List[str],
|
88 |
max_frames:int,
|
|
|
99 |
return np.stack([x.to_ndarray(format="rgb24") for x in frames])
|
100 |
|
101 |
MAX_NUM_FRAMES=16
|
102 |
+
ROUND_DIGIT=3
|
103 |
REGRESSION_QUERY_PROMPT = """
|
104 |
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
|
105 |
please watch the following frames of a given video and see the text prompt for generating the video,
|
|
|
124 |
all the frames of video are as follows:
|
125 |
"""
|
126 |
|
127 |
+
model_name="TIGER-Lab/MantisScore-anno-only"
|
128 |
video_path="examples/video1.mp4"
|
129 |
+
video_prompt="Near the Elephant Gate village, they approach the haunted house at night. Rajiv feels anxious, but Bhavesh encourages him. As they reach the house, a mysterious sound in the air adds to the suspense."
|
130 |
+
|
131 |
+
processor = AutoProcessor.from_pretrained(model_name,torch_dtype=torch.bfloat16)
|
132 |
+
model = Idefics2ForSequenceClassification.from_pretrained(model_name,torch_dtype=torch.bfloat16).eval()
|
133 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
134 |
+
model.to(device)
|
135 |
|
136 |
# sample uniformly up to MAX_NUM_FRAMES frames from the video
|
137 |
container = av.open(video_path)
|
|
|
142 |
indices = np.arange(total_frames)
|
143 |
|
144 |
frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
|
145 |
+
eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
|
146 |
num_image_token = eval_prompt.count("<image>")
|
147 |
if num_image_token < len(frames):
|
148 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|
|
|
166 |
aspect_scores = []
|
167 |
for i in range(num_aspects):
|
168 |
aspect_scores.append(round(logits[0, i].item(),ROUND_DIGIT))
|
169 |
+
print(aspect_scores)
|
|
|
170 |
"""
|
171 |
# model output on visual quality, temporal consistency, dynamic degree,
|
172 |
# text-to-video alignment, factual consistency, respectively
|
173 |
+
[2.453, 2.706, 2.468, 2.464, 2.572]
|
174 |
"""
|
175 |
|
176 |
```
|