hexuan21 committed on
Commit b289481 · verified · 1 Parent(s): 9da055e

Update README.md

Files changed (1)
  1. README.md +82 -2
README.md CHANGED
@@ -10,8 +10,6 @@ library_name: transformers
 pipeline_tag: visual-question-answering
 ---
 
-# ![MantisScore_logo](https://tiger-ai-lab.github.io/MantisScore/static/images/logo3.png) MantisScore
-
 
 [Paper] | [Website](https://tiger-ai-lab.github.io/MantisScore/) | [Github](https://github.com/TIGER-AI-Lab/MantisScore) | [Datasets](https://huggingface.co/datasets/TIGER-Lab/VideoEval) | [Model](https://huggingface.co/TIGER-Lab/MantisScore) | [Demo](https://huggingface.co/spaces/Mantis-VL/MantisScore)
 
@@ -63,9 +61,91 @@ pip install git+https://github.com/TIGER-AI-Lab/MantisScore.git
 ```
 
 ### Inference
+```python
+import av
+import numpy as np
+import torch
+from PIL import Image
+
+
+def _read_video_pyav(container, indices):
+    # Decode the frames at `indices` from an open PyAV container and
+    # return them as a stacked (num_frames, H, W, 3) RGB array.
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+MAX_NUM_FRAMES = 16
+ROUND_DIGIT = 3  # decimal places for the reported scores (assumed; not defined in the original snippet)
+REGRESSION_QUERY_PROMPT = """
+Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
+please watch the following frames of a given video and see the text prompt for generating the video,
+then give scores from 5 different dimensions:
+(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
+(2) temporal consistency, both the consistency of objects or humans and the smoothness of motion or movements
+(3) dynamic degree, the degree of dynamic changes
+(4) text-to-video alignment, the alignment between the text prompt and the video content
+(5) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+
+for each dimension, output a float number from 1.0 to 4.0,
+the higher the number is, the better the video performs in that sub-score,
+the lowest 1.0 means Bad, the highest 4.0 means Perfect/Real (the video is like a real video)
+Here is an output example:
+visual quality: 3.2
+temporal consistency: 2.7
+dynamic degree: 4.0
+text-to-video alignment: 2.3
+factual consistency: 1.8
+
+For this video, the text prompt is "{text_prompt}",
+all the frames of video are as follows:
+"""
+
+# NOTE: `model` and `processor` are assumed to be loaded beforehand;
+# this snippet does not show that step.
+video_path = "examples/video1.mp4"
+video_prompt = "..."  # placeholder: the text prompt used to generate the video
+
+# sample up to MAX_NUM_FRAMES uniformly spaced frames from the video
+container = av.open(video_path)
+total_frames = container.streams.video[0].frames
+if total_frames > MAX_NUM_FRAMES:
+    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
+else:
+    indices = np.arange(total_frames)
+
+frames = [Image.fromarray(x) for x in _read_video_pyav(container, indices)]
+eval_prompt = REGRESSION_QUERY_PROMPT.format(text_prompt=video_prompt)
+num_image_token = eval_prompt.count("<image>")
+if num_image_token < len(frames):
+    eval_prompt += "<image> " * (len(frames) - num_image_token)
+
+flatten_images = []
+for x in [frames]:
+    if isinstance(x, list):
+        flatten_images.extend(x)
+    else:
+        flatten_images.append(x)
+flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
+inputs = processor(text=eval_prompt, images=flatten_images, return_tensors="pt")
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+logits = outputs.logits
+num_aspects = logits.shape[-1]
+
+aspect_scores = []
+for i in range(num_aspects):
+    aspect_scores.append(round(logits[0, i].item(), ROUND_DIGIT))
+print(aspect_scores)
+```
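
The snippet prints `aspect_scores` as a bare list of floats, one per dimension. A minimal sketch of pairing those scores with their dimension names, assuming the five logits follow the same (1)-(5) ordering as the dimensions listed in `REGRESSION_QUERY_PROMPT` (the values below are just the prompt's illustrative example, not real model output):

```python
# Sketch: label each score with its dimension name, assuming the logit
# order matches the (1)-(5) listing in REGRESSION_QUERY_PROMPT.
DIMENSION_NAMES = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]

aspect_scores = [3.2, 2.7, 4.0, 2.3, 1.8]  # illustrative values, taken from the prompt's example output
named_scores = dict(zip(DIMENSION_NAMES, aspect_scores))
print(named_scores)
# {'visual quality': 3.2, 'temporal consistency': 2.7, 'dynamic degree': 4.0,
#  'text-to-video alignment': 2.3, 'factual consistency': 1.8}
```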
 
 ### Training
+See [MantisScore/training](https://github.com/TIGER-AI-Lab/MantisScore/training) for details.
 
 ### Evaluation
+See [MantisScore/benchmark](https://github.com/TIGER-AI-Lab/MantisScore/benchmark) for details.
 
 ## Citation