Spaces:
Running
Running
Commit
·
2b3f382
1
Parent(s):
9889fe8
initial changes to website
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- index.html +154 -244
- static/.DS_Store +0 -0
- static/images/method_case.jpg +0 -0
- static/images/motivation_case.jpg +0 -0
- static/images/usc_logo.png +0 -0
- static/images/vicrop_methods.jpg +0 -0
- static/images/vu_logo.png +0 -0
- static/interpolation/stacked/000000.jpg +0 -0
- static/interpolation/stacked/000001.jpg +0 -0
- static/interpolation/stacked/000002.jpg +0 -0
- static/interpolation/stacked/000003.jpg +0 -0
- static/interpolation/stacked/000004.jpg +0 -0
- static/interpolation/stacked/000005.jpg +0 -0
- static/interpolation/stacked/000006.jpg +0 -0
- static/interpolation/stacked/000007.jpg +0 -0
- static/interpolation/stacked/000008.jpg +0 -0
- static/interpolation/stacked/000009.jpg +0 -0
- static/interpolation/stacked/000010.jpg +0 -0
- static/interpolation/stacked/000011.jpg +0 -0
- static/interpolation/stacked/000012.jpg +0 -0
- static/interpolation/stacked/000013.jpg +0 -0
- static/interpolation/stacked/000014.jpg +0 -0
- static/interpolation/stacked/000015.jpg +0 -0
- static/interpolation/stacked/000016.jpg +0 -0
- static/interpolation/stacked/000017.jpg +0 -0
- static/interpolation/stacked/000018.jpg +0 -0
- static/interpolation/stacked/000019.jpg +0 -0
- static/interpolation/stacked/000020.jpg +0 -0
- static/interpolation/stacked/000021.jpg +0 -0
- static/interpolation/stacked/000022.jpg +0 -0
- static/interpolation/stacked/000023.jpg +0 -0
- static/interpolation/stacked/000024.jpg +0 -0
- static/interpolation/stacked/000025.jpg +0 -0
- static/interpolation/stacked/000026.jpg +0 -0
- static/interpolation/stacked/000027.jpg +0 -0
- static/interpolation/stacked/000028.jpg +0 -0
- static/interpolation/stacked/000029.jpg +0 -0
- static/interpolation/stacked/000030.jpg +0 -0
- static/interpolation/stacked/000031.jpg +0 -0
- static/interpolation/stacked/000032.jpg +0 -0
- static/interpolation/stacked/000033.jpg +0 -0
- static/interpolation/stacked/000034.jpg +0 -0
- static/interpolation/stacked/000035.jpg +0 -0
- static/interpolation/stacked/000036.jpg +0 -0
- static/interpolation/stacked/000037.jpg +0 -0
- static/interpolation/stacked/000038.jpg +0 -0
- static/interpolation/stacked/000039.jpg +0 -0
- static/interpolation/stacked/000040.jpg +0 -0
- static/interpolation/stacked/000041.jpg +0 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
index.html
CHANGED
@@ -33,34 +33,26 @@
|
|
33 |
<div class="container is-max-desktop">
|
34 |
<div class="columns is-centered">
|
35 |
<div class="column has-text-centered">
|
36 |
-
<h1 class="title is-1 publication-title">
|
37 |
<div class="is-size-5 publication-authors">
|
38 |
<span class="author-block">
|
39 |
-
<a href="https://
|
40 |
<span class="author-block">
|
41 |
-
<a href="https://
|
42 |
<span class="author-block">
|
43 |
-
<a href="https://
|
44 |
</span>
|
|
|
45 |
<span class="author-block">
|
46 |
-
<a href="
|
47 |
-
</span>
|
48 |
-
<span class="author-block">
|
49 |
-
<a href="https://www.danbgoldman.com" target="_blank">Dan B Goldman</a><sup>2</sup>,
|
50 |
-
</span>
|
51 |
-
<span class="author-block">
|
52 |
-
<a href="https://homes.cs.washington.edu/~seitz/" target="_blank">Steven M. Seitz</a><sup>1,2</sup>,
|
53 |
-
</span>
|
54 |
-
<span class="author-block">
|
55 |
-
<a href="http://www.ricardomartinbrualla.com" target="_blank">Ricardo Martin-Brualla</a><sup>2</sup>
|
56 |
</span>
|
57 |
</div>
|
58 |
-
|
59 |
<div class="is-size-5 publication-authors">
|
60 |
-
<span class="author-block"><
|
61 |
-
<span class="author-block"><
|
62 |
</div>
|
63 |
|
|
|
64 |
<div class="column has-text-centered">
|
65 |
<div class="publication-links">
|
66 |
<!-- PDF Link. -->
|
@@ -83,7 +75,7 @@
|
|
83 |
</a>
|
84 |
</span>
|
85 |
<!-- Video Link. -->
|
86 |
-
<span class="link-block">
|
87 |
<a href="https://www.youtube.com/watch?v=MrKrnHhk8IA" target="_blank"
|
88 |
class="external-link button is-normal is-rounded is-dark">
|
89 |
<span class="icon">
|
@@ -91,10 +83,10 @@
|
|
91 |
</span>
|
92 |
<span>Video</span>
|
93 |
</a>
|
94 |
-
</span>
|
95 |
<!-- Code Link. -->
|
96 |
<span class="link-block">
|
97 |
-
<a href="https://github.com/
|
98 |
class="external-link button is-normal is-rounded is-dark">
|
99 |
<span class="icon">
|
100 |
<i class="fab fa-github"></i>
|
@@ -103,7 +95,7 @@
|
|
103 |
</a>
|
104 |
</span>
|
105 |
<!-- Dataset Link. -->
|
106 |
-
<span class="link-block">
|
107 |
<a href="https://github.com/google/nerfies/releases/tag/0.1" target="_blank"
|
108 |
class="external-link button is-normal is-rounded is-dark">
|
109 |
<span class="icon">
|
@@ -111,83 +103,33 @@
|
|
111 |
</span>
|
112 |
<span>Data</span>
|
113 |
</a>
|
114 |
-
</div>
|
115 |
|
116 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
</div>
|
118 |
</div>
|
119 |
</div>
|
120 |
</div>
|
121 |
</section>
|
122 |
|
123 |
-
<section class="hero teaser">
|
124 |
-
<div class="container is-max-desktop">
|
125 |
-
<div class="hero-body">
|
126 |
-
<video id="teaser" autoplay muted loop playsinline height="100%">
|
127 |
-
<source src="./static/videos/teaser.mp4"
|
128 |
-
type="video/mp4">
|
129 |
-
</video>
|
130 |
-
<h2 class="subtitle has-text-centered">
|
131 |
-
<span class="dnerf">Nerfies</span> turns selfie videos from your phone into
|
132 |
-
free-viewpoint
|
133 |
-
portraits.
|
134 |
-
</h2>
|
135 |
-
</div>
|
136 |
-
</div>
|
137 |
-
</section>
|
138 |
|
139 |
|
140 |
-
<section class="
|
141 |
-
<div class="
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
<div class="item item-chair-tp">
|
151 |
-
<video poster="" id="chair-tp" autoplay controls muted loop playsinline height="100%">
|
152 |
-
<source src="./static/videos/chair-tp.mp4"
|
153 |
-
type="video/mp4">
|
154 |
-
</video>
|
155 |
-
</div>
|
156 |
-
<div class="item item-shiba">
|
157 |
-
<video poster="" id="shiba" autoplay controls muted loop playsinline height="100%">
|
158 |
-
<source src="./static/videos/shiba.mp4"
|
159 |
-
type="video/mp4">
|
160 |
-
</video>
|
161 |
-
</div>
|
162 |
-
<div class="item item-fullbody">
|
163 |
-
<video poster="" id="fullbody" autoplay controls muted loop playsinline height="100%">
|
164 |
-
<source src="./static/videos/fullbody.mp4"
|
165 |
-
type="video/mp4">
|
166 |
-
</video>
|
167 |
-
</div>
|
168 |
-
<div class="item item-blueshirt">
|
169 |
-
<video poster="" id="blueshirt" autoplay controls muted loop playsinline height="100%">
|
170 |
-
<source src="./static/videos/blueshirt.mp4"
|
171 |
-
type="video/mp4">
|
172 |
-
</video>
|
173 |
-
</div>
|
174 |
-
<div class="item item-mask">
|
175 |
-
<video poster="" id="mask" autoplay controls muted loop playsinline height="100%">
|
176 |
-
<source src="./static/videos/mask.mp4"
|
177 |
-
type="video/mp4">
|
178 |
-
</video>
|
179 |
-
</div>
|
180 |
-
<div class="item item-coffee">
|
181 |
-
<video poster="" id="coffee" autoplay controls muted loop playsinline height="100%">
|
182 |
-
<source src="./static/videos/coffee.mp4"
|
183 |
-
type="video/mp4">
|
184 |
-
</video>
|
185 |
-
</div>
|
186 |
-
<div class="item item-toby">
|
187 |
-
<video poster="" id="toby" autoplay controls muted loop playsinline height="100%">
|
188 |
-
<source src="./static/videos/toby2.mp4"
|
189 |
-
type="video/mp4">
|
190 |
-
</video>
|
191 |
</div>
|
192 |
</div>
|
193 |
</div>
|
@@ -203,187 +145,155 @@
|
|
203 |
<h2 class="title is-3">Abstract</h2>
|
204 |
<div class="content has-text-justified">
|
205 |
<p>
|
206 |
-
We
|
207 |
-
deforming scene using photos/videos captured casually from mobile phones.
|
208 |
-
</p>
|
209 |
-
<p>
|
210 |
-
Our approach augments neural radiance fields
|
211 |
-
(NeRF) by optimizing an
|
212 |
-
additional continuous volumetric deformation field that warps each observed point into a
|
213 |
-
canonical 5D NeRF.
|
214 |
-
We observe that these NeRF-like deformation fields are prone to local minima, and
|
215 |
-
propose a coarse-to-fine optimization method for coordinate-based models that allows for
|
216 |
-
more robust optimization.
|
217 |
-
By adapting principles from geometry processing and physical simulation to NeRF-like
|
218 |
-
models, we propose an elastic regularization of the deformation field that further
|
219 |
-
improves robustness.
|
220 |
-
</p>
|
221 |
-
<p>
|
222 |
-
We show that <span class="dnerf">Nerfies</span> can turn casually captured selfie
|
223 |
-
photos/videos into deformable NeRF
|
224 |
-
models that allow for photorealistic renderings of the subject from arbitrary
|
225 |
-
viewpoints, which we dub <i>"nerfies"</i>. We evaluate our method by collecting data
|
226 |
-
using a
|
227 |
-
rig with two mobile phones that take time-synchronized photos, yielding train/validation
|
228 |
-
images of the same pose at different viewpoints. We show that our method faithfully
|
229 |
-
reconstructs non-rigidly deforming scenes and reproduces unseen views with high
|
230 |
-
fidelity.
|
231 |
</p>
|
232 |
</div>
|
233 |
</div>
|
234 |
</div>
|
235 |
<!--/ Abstract. -->
|
|
|
236 |
|
237 |
-
|
|
|
|
|
238 |
<div class="columns is-centered has-text-centered">
|
239 |
<div class="column is-four-fifths">
|
240 |
-
<
|
241 |
-
<div class="publication-
|
242 |
-
<
|
243 |
-
|
|
|
|
|
244 |
</div>
|
245 |
</div>
|
246 |
</div>
|
247 |
-
<!--/ Paper video. -->
|
248 |
</div>
|
249 |
</section>
|
250 |
|
251 |
-
|
252 |
<section class="section">
|
253 |
<div class="container is-max-desktop">
|
254 |
-
|
255 |
-
<div class="columns is-centered">
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
<
|
261 |
-
|
262 |
-
Using <i>nerfies</i> you can create fun visual effects. This Dolly zoom effect
|
263 |
-
would be impossible without nerfies since it would require going through a wall.
|
264 |
</p>
|
265 |
-
<video id="dollyzoom" autoplay controls muted loop playsinline height="100%">
|
266 |
-
<source src="./static/videos/dollyzoom-stacked.mp4"
|
267 |
-
type="video/mp4">
|
268 |
-
</video>
|
269 |
-
</div>
|
270 |
-
</div>
|
271 |
-
<!--/ Visual Effects. -->
|
272 |
-
|
273 |
-
<!-- Matting. -->
|
274 |
-
<div class="column">
|
275 |
-
<h2 class="title is-3">Matting</h2>
|
276 |
-
<div class="columns is-centered">
|
277 |
-
<div class="column content">
|
278 |
-
<p>
|
279 |
-
As a byproduct of our method, we can also solve the matting problem by ignoring
|
280 |
-
samples that fall outside of a bounding box during rendering.
|
281 |
-
</p>
|
282 |
-
<video id="matting-video" controls playsinline height="100%">
|
283 |
-
<source src="./static/videos/matting.mp4"
|
284 |
-
type="video/mp4">
|
285 |
-
</video>
|
286 |
-
</div>
|
287 |
-
|
288 |
</div>
|
289 |
</div>
|
290 |
</div>
|
291 |
-
|
292 |
-
|
293 |
-
<!-- Animation. -->
|
294 |
-
<div class="columns is-centered">
|
295 |
-
<div class="column is-full-width">
|
296 |
-
<h2 class="title is-3">Animation</h2>
|
297 |
-
|
298 |
-
<!-- Interpolating. -->
|
299 |
-
<h3 class="title is-4">Interpolating states</h3>
|
300 |
-
<div class="content has-text-justified">
|
301 |
-
<p>
|
302 |
-
We can also animate the scene by interpolating the deformation latent codes of two input
|
303 |
-
frames. Use the slider here to linearly interpolate between the left frame and the right
|
304 |
-
frame.
|
305 |
-
</p>
|
306 |
-
</div>
|
307 |
-
<div class="columns is-vcentered interpolation-panel">
|
308 |
-
<div class="column is-3 has-text-centered">
|
309 |
-
<img src="./static/images/interpolate_start.jpg"
|
310 |
-
class="interpolation-image"
|
311 |
-
alt="Interpolate start reference image."/>
|
312 |
-
<p>Start Frame</p>
|
313 |
-
</div>
|
314 |
-
<div class="column interpolation-video-column">
|
315 |
-
<div id="interpolation-image-wrapper">
|
316 |
-
Loading...
|
317 |
-
</div>
|
318 |
-
<input class="slider is-fullwidth is-large is-info"
|
319 |
-
id="interpolation-slider"
|
320 |
-
step="1" min="0" max="100" value="0" type="range">
|
321 |
-
</div>
|
322 |
-
<div class="column is-3 has-text-centered">
|
323 |
-
<img src="./static/images/interpolate_end.jpg"
|
324 |
-
class="interpolation-image"
|
325 |
-
alt="Interpolation end reference image."/>
|
326 |
-
<p class="is-bold">End Frame</p>
|
327 |
-
</div>
|
328 |
-
</div>
|
329 |
-
<br/>
|
330 |
-
<!--/ Interpolating. -->
|
331 |
-
|
332 |
-
<!-- Re-rendering. -->
|
333 |
-
<h3 class="title is-4">Re-rendering the input video</h3>
|
334 |
-
<div class="content has-text-justified">
|
335 |
-
<p>
|
336 |
-
Using <span class="dnerf">Nerfies</span>, you can re-render a video from a novel
|
337 |
-
viewpoint such as a stabilized camera by playing back the training deformations.
|
338 |
-
</p>
|
339 |
-
</div>
|
340 |
-
<div class="content has-text-centered">
|
341 |
-
<video id="replay-video"
|
342 |
-
controls
|
343 |
-
muted
|
344 |
-
preload
|
345 |
-
playsinline
|
346 |
-
width="75%">
|
347 |
-
<source src="./static/videos/replay.mp4"
|
348 |
-
type="video/mp4">
|
349 |
-
</video>
|
350 |
-
</div>
|
351 |
-
<!--/ Re-rendering. -->
|
352 |
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
|
357 |
|
358 |
-
<!-- Concurrent Work. -->
|
359 |
-
<div class="columns is-centered">
|
360 |
-
<div class="column is-full-width">
|
361 |
-
<h2 class="title is-3">Related Links</h2>
|
362 |
|
363 |
-
<div class="content has-text-justified">
|
364 |
-
<p>
|
365 |
-
There's a lot of excellent work that was introduced around the same time as ours.
|
366 |
-
</p>
|
367 |
-
<p>
|
368 |
-
<a href="https://arxiv.org/abs/2104.09125" target="_blank">Progressive Encoding for Neural Optimization</a> introduces an idea similar to our windowed position encoding for coarse-to-fine optimization.
|
369 |
-
</p>
|
370 |
-
<p>
|
371 |
-
<a href="https://www.albertpumarola.com/research/D-NeRF/index.html" target="_blank">D-NeRF</a> and <a href="https://gvv.mpi-inf.mpg.de/projects/nonrigid_nerf/" target="_blank">NR-NeRF</a>
|
372 |
-
both use deformation fields to model non-rigid scenes.
|
373 |
-
</p>
|
374 |
-
<p>
|
375 |
-
Some works model videos with a NeRF by directly modulating the density, such as <a href="https://video-nerf.github.io/" target="_blank">Video-NeRF</a>, <a href="https://www.cs.cornell.edu/~zl548/NSFF/" target="_blank">NSFF</a>, and <a href="https://neural-3d-video.github.io/" target="_blank">DyNeRF</a>
|
376 |
-
</p>
|
377 |
-
<p>
|
378 |
-
There are probably many more by the time you are reading this. Check out <a href="https://dellaert.github.io/NeRF/" target="_blank">Frank Dellart's survey on recent NeRF papers</a>, and <a href="https://github.com/yenchenlin/awesome-NeRF" target="_blank">Yen-Chen Lin's curated list of NeRF papers</a>.
|
379 |
-
</p>
|
380 |
-
</div>
|
381 |
-
</div>
|
382 |
-
</div>
|
383 |
-
<!--/ Concurrent Work. -->
|
384 |
|
385 |
-
</div>
|
386 |
-
</section>
|
387 |
|
388 |
|
389 |
<section class="section" id="BibTeX">
|
|
|
33 |
<div class="container is-max-desktop">
|
34 |
<div class="columns is-centered">
|
35 |
<div class="column has-text-centered">
|
36 |
+
<h1 class="title is-1 publication-title">MLLMs Know Where to Look:<br>Training-Free Perception of Small Visual Details with Multimodal LLMs</h1>
|
37 |
<div class="is-size-5 publication-authors">
|
38 |
<span class="author-block">
|
39 |
+
<a href="https://saccharomycetes.github.io/" target="_blank">Jiarui Zhang </a><img src="./static/images/usc_logo.png" style="height: 1em; vertical-align: middle;">,</span>
|
40 |
<span class="author-block">
|
41 |
+
<a href="https://mahyarkoy.github.io/" target="_blank">Mahyar Khayatkhoei </a><img src="./static/images/usc_logo.png" style="height: 1em; vertical-align: middle;">,</span>
|
42 |
<span class="author-block">
|
43 |
+
<a href="https://www.prateekchhikara.com/" target="_blank">Prateek Chhikara </a><img src="./static/images/usc_logo.png" style="height: 1em; vertical-align: middle;">,
|
44 |
</span>
|
45 |
+
and
|
46 |
<span class="author-block">
|
47 |
+
<a href="https://www.ilievski.info/" target="_blank">Filip Ilievski </a><img src="./static/images/vu_logo.png" style="height: 1em; vertical-align: middle;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
</span>
|
49 |
</div>
|
|
|
50 |
<div class="is-size-5 publication-authors">
|
51 |
+
<span class="author-block"><img src="./static/images/usc_logo.png" style="height: 1em; vertical-align: middle;"> University of Southern California, USA</span> <br>
|
52 |
+
<span class="author-block"><img src="./static/images/vu_logo.png" style="height: 1em; vertical-align: middle;"> Vrije Universiteit Amsterdam, The Netherlands</span>
|
53 |
</div>
|
54 |
|
55 |
+
|
56 |
<div class="column has-text-centered">
|
57 |
<div class="publication-links">
|
58 |
<!-- PDF Link. -->
|
|
|
75 |
</a>
|
76 |
</span>
|
77 |
<!-- Video Link. -->
|
78 |
+
<!-- <span class="link-block">
|
79 |
<a href="https://www.youtube.com/watch?v=MrKrnHhk8IA" target="_blank"
|
80 |
class="external-link button is-normal is-rounded is-dark">
|
81 |
<span class="icon">
|
|
|
83 |
</span>
|
84 |
<span>Video</span>
|
85 |
</a>
|
86 |
+
</span> -->
|
87 |
<!-- Code Link. -->
|
88 |
<span class="link-block">
|
89 |
+
<a href="https://github.com/saccharomycetes/mllms_know" target="_blank"
|
90 |
class="external-link button is-normal is-rounded is-dark">
|
91 |
<span class="icon">
|
92 |
<i class="fab fa-github"></i>
|
|
|
95 |
</a>
|
96 |
</span>
|
97 |
<!-- Dataset Link. -->
|
98 |
+
<!-- <span class="link-block">
|
99 |
<a href="https://github.com/google/nerfies/releases/tag/0.1" target="_blank"
|
100 |
class="external-link button is-normal is-rounded is-dark">
|
101 |
<span class="icon">
|
|
|
103 |
</span>
|
104 |
<span>Data</span>
|
105 |
</a>
|
106 |
+
</div> -->
|
107 |
|
108 |
</div>
|
109 |
+
|
110 |
+
<div class="column has-text-centered" style="margin: 1.5rem 0; padding: 0.75rem; background: linear-gradient(45deg, #4a90e2, #50e3c2); color: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); text-align: center;">
|
111 |
+
Accepted at ICLR 2025
|
112 |
+
</div>
|
113 |
+
|
114 |
+
|
115 |
</div>
|
116 |
</div>
|
117 |
</div>
|
118 |
</div>
|
119 |
</section>
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
|
123 |
+
<section class="section">
|
124 |
+
<div class="container is-max-desktop">
|
125 |
+
<!-- Paper poster. -->
|
126 |
+
<div class="columns is-centered has-text-centered">
|
127 |
+
<div class="column is-four-fifths">
|
128 |
+
<div class="publication-image">
|
129 |
+
<img src="./static/images/motivation_case.jpg" alt="Examples of MLLMs attending to the correct image region while answering the question incorrectly">
|
130 |
+
<p class="caption">
|
131 |
+
Examples of MLLMs knowing where to look despite answering incorrectly. The right panel in each example displays the relative attention to the image from one layer of the MLLM.
|
132 |
+
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
</div>
|
134 |
</div>
|
135 |
</div>
|
|
|
145 |
<h2 class="title is-3">Abstract</h2>
|
146 |
<div class="content has-text-justified">
|
147 |
<p>
|
148 |
+
Multimodal Large Language Models (MLLMs) have experienced rapid progress in recent years. Given their potential integration into many critical applications, it is important to understand the limitations of their perception ability. In this work, we study whether MLLMs can perceive small visual details as well as large ones in images. In particular, we observe that their accuracy in answering visual questions is very sensitive to the size of the visual subject of the question. We further show that this effect is causal by observing that human visual cropping can significantly mitigate this sensitivity. Next, we study the attention patterns of MLLMs when answering visual questions, and intriguingly find that they consistently know where to look, even when they provide the wrong answer. Based on these findings, we then construct automatic visual cropping methods that leverage the internal knowledge of any MLLM itself, in the form of attention and gradient maps, to help it better perceive the small visual subject of any question. We study our proposed methods on two popular MLLMs and seven multimodal benchmarks, and show that they can significantly improve MLLMs' accuracy <b>without requiring any training</b>. Our findings suggest that MLLMs should be used with caution in detail-sensitive applications, and that visual cropping with the model's own knowledge is a promising direction to improve their performance.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
</p>
|
150 |
</div>
|
151 |
</div>
|
152 |
</div>
|
153 |
<!--/ Abstract. -->
|
154 |
+
</section>
|
155 |
|
156 |
+
<section class="section">
|
157 |
+
<div class="container is-max-desktop">
|
158 |
+
<!-- Paper poster. -->
|
159 |
<div class="columns is-centered has-text-centered">
|
160 |
<div class="column is-four-fifths">
|
161 |
+
<h3 class="title is-3" style="text-align: left">Automatic Visual Cropping</h3>
|
162 |
+
<div class="publication-image">
|
163 |
+
<img src="./static/images/vicrop_methods.jpg" alt="Diagram of the proposed automatic visual cropping approach applied to two MLLMs">
|
164 |
+
<p class="caption">
|
165 |
+
Illustration of the proposed visual cropping approach applied to two MLLMs.
|
166 |
+
</p>
|
167 |
</div>
|
168 |
</div>
|
169 |
</div>
|
|
|
170 |
</div>
|
171 |
</section>
|
172 |
|
|
|
173 |
<section class="section">
|
174 |
<div class="container is-max-desktop">
|
175 |
+
<!-- Paper poster. -->
|
176 |
+
<div class="columns is-centered has-text-centered">
|
177 |
+
<div class="column is-four-fifths">
|
178 |
+
<h3 class="title is-3" style="text-align: left">Visual Cropping Methods Analysis</h3>
|
179 |
+
<div class="publication-image">
|
180 |
+
<img src="./static/images/method_case.jpg" alt="Examples where rel-att visual cropping helps MLLMs correct their answers, with the cropped region outlined in cyan">
|
181 |
+
<p class="caption">
|
182 |
+
Examples of rel-att helping MLLMs correct their mistakes (<i>cyan-colored bounding box shows cropped region by rel-att; zoom-in insets are displayed for better readability</i>).
|
|
|
|
|
183 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
</div>
|
185 |
</div>
|
186 |
</div>
|
187 |
+
</div>
|
188 |
+
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
+
<table class="table is-bordered is-striped is-narrow is-hoverable" style="margin: 0 auto; width: auto; font-size: 0.9em;">
|
191 |
+
<caption style="caption-side: top; margin-bottom: 1em; font-weight: bold; color: #363636;">Accuracy of the proposed ViCrop methods on visual question answering benchmarks.</caption>
|
192 |
+
<thead>
|
193 |
+
<tr>
|
194 |
+
<th rowspan="2" colspan="2" style="vertical-align: middle; background-color: #f5f5f5;">Model</th>
|
195 |
+
<th colspan="4" style="text-align: center; background-color: #f0f8ff;">Smaller Visual Concepts</th>
|
196 |
+
<th colspan="3" style="text-align: center; background-color: #fff0f5;">Larger Visual Concepts</th>
|
197 |
+
</tr>
|
198 |
+
<tr>
|
199 |
+
<th style="background-color: #f0f8ff;">TextVQA†</th>
|
200 |
+
<th style="background-color: #f0f8ff;">V*</th>
|
201 |
+
<th style="background-color: #f0f8ff;">POPE</th>
|
202 |
+
<th style="background-color: #f0f8ff;">DocVQA</th>
|
203 |
+
<th style="background-color: #fff0f5;">AOKVQA</th>
|
204 |
+
<th style="background-color: #fff0f5;">GQA</th>
|
205 |
+
<th style="background-color: #fff0f5;">VQAv2</th>
|
206 |
+
</tr>
|
207 |
+
</thead>
|
208 |
+
<tbody>
|
209 |
+
<tr>
|
210 |
+
<td rowspan="4" style="font-weight: bold; background-color: #f5f5f5;">LLAVA-1.5</td>
|
211 |
+
<td style="font-weight: bold;">no cropping</td>
|
212 |
+
<td>47.80</td>
|
213 |
+
<td>42.41</td>
|
214 |
+
<td>85.27</td>
|
215 |
+
<td>15.97</td>
|
216 |
+
<td>59.01</td>
|
217 |
+
<td>60.48</td>
|
218 |
+
<td>75.57</td>
|
219 |
+
</tr>
|
220 |
+
<tr>
|
221 |
+
<td style="font-weight: bold;">rel-att</td>
|
222 |
+
<td>55.17</td>
|
223 |
+
<td style="font-weight: bold; color: #000000;">62.30</td>
|
224 |
+
<td style="font-weight: bold; color: #000000;">87.25</td>
|
225 |
+
<td>19.63</td>
|
226 |
+
<td style="font-weight: bold; color: #000000;">60.66</td>
|
227 |
+
<td>60.97</td>
|
228 |
+
<td style="font-weight: bold; color: #000000;">76.51</td>
|
229 |
+
</tr>
|
230 |
+
<tr>
|
231 |
+
<td style="font-weight: bold;">grad-att</td>
|
232 |
+
<td style="font-weight: bold; color: #000000;">56.06</td>
|
233 |
+
<td>57.07</td>
|
234 |
+
<td>87.03</td>
|
235 |
+
<td style="font-weight: bold; color: #000000;">19.84</td>
|
236 |
+
<td>59.94</td>
|
237 |
+
<td style="font-weight: bold; color: #000000;">60.98</td>
|
238 |
+
<td>76.06</td>
|
239 |
+
</tr>
|
240 |
+
<tr>
|
241 |
+
<td style="font-weight: bold;">pure-grad</td>
|
242 |
+
<td>51.67</td>
|
243 |
+
<td>46.07</td>
|
244 |
+
<td>86.06</td>
|
245 |
+
<td>17.70</td>
|
246 |
+
<td>59.92</td>
|
247 |
+
<td>60.54</td>
|
248 |
+
<td>75.94</td>
|
249 |
+
</tr>
|
250 |
+
<tr>
|
251 |
+
<td rowspan="4" style="font-weight: bold; background-color: #f5f5f5;">InstructBLIP</td>
|
252 |
+
<td style="font-weight: bold;">no cropping</td>
|
253 |
+
<td>33.48</td>
|
254 |
+
<td>35.60</td>
|
255 |
+
<td>84.89</td>
|
256 |
+
<td>9.20</td>
|
257 |
+
<td>60.06</td>
|
258 |
+
<td>49.41</td>
|
259 |
+
<td>76.25</td>
|
260 |
+
</tr>
|
261 |
+
<tr>
|
262 |
+
<td style="font-weight: bold;">rel-att</td>
|
263 |
+
<td>45.44</td>
|
264 |
+
<td style="font-weight: bold; color: #000000;">42.41</td>
|
265 |
+
<td>86.64</td>
|
266 |
+
<td>9.95</td>
|
267 |
+
<td>61.28</td>
|
268 |
+
<td>49.75</td>
|
269 |
+
<td style="font-weight: bold; color: #000000;">76.84</td>
|
270 |
+
</tr>
|
271 |
+
<tr>
|
272 |
+
<td style="font-weight: bold;">grad-att</td>
|
273 |
+
<td style="font-weight: bold; color: #000000;">45.71</td>
|
274 |
+
<td>37.70</td>
|
275 |
+
<td style="font-weight: bold; color: #000000;">86.99</td>
|
276 |
+
<td style="font-weight: bold; color: #000000;">10.81</td>
|
277 |
+
<td style="font-weight: bold; color: #000000;">61.77</td>
|
278 |
+
<td style="font-weight: bold; color: #000000;">50.33</td>
|
279 |
+
<td>76.08</td>
|
280 |
+
</tr>
|
281 |
+
<tr>
|
282 |
+
<td style="font-weight: bold;">pure-grad</td>
|
283 |
+
<td>42.23</td>
|
284 |
+
<td>37.17</td>
|
285 |
+
<td>86.84</td>
|
286 |
+
<td>8.99</td>
|
287 |
+
<td>61.60</td>
|
288 |
+
<td>50.08</td>
|
289 |
+
<td>76.71</td>
|
290 |
+
</tr>
|
291 |
+
</tbody>
|
292 |
+
</table>
|
293 |
|
294 |
|
|
|
|
|
|
|
|
|
295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
|
|
|
|
|
297 |
|
298 |
|
299 |
<section class="section" id="BibTeX">
|
static/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
static/images/method_case.jpg
ADDED
![]() |
static/images/motivation_case.jpg
ADDED
![]() |
static/images/usc_logo.png
ADDED
![]() |
static/images/vicrop_methods.jpg
ADDED
![]() |
static/images/vu_logo.png
ADDED
![]() |
static/interpolation/stacked/000000.jpg
DELETED
Binary file (128 kB)
|
|
static/interpolation/stacked/000001.jpg
DELETED
Binary file (128 kB)
|
|
static/interpolation/stacked/000002.jpg
DELETED
Binary file (128 kB)
|
|
static/interpolation/stacked/000003.jpg
DELETED
Binary file (128 kB)
|
|
static/interpolation/stacked/000004.jpg
DELETED
Binary file (128 kB)
|
|
static/interpolation/stacked/000005.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000006.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000007.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000008.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000009.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000010.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000011.jpg
DELETED
Binary file (129 kB)
|
|
static/interpolation/stacked/000012.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000013.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000014.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000015.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000016.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000017.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000018.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000019.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000020.jpg
DELETED
Binary file (130 kB)
|
|
static/interpolation/stacked/000021.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000022.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000023.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000024.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000025.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000026.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000027.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000028.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000029.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000030.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000031.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000032.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000033.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000034.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000035.jpg
DELETED
Binary file (131 kB)
|
|
static/interpolation/stacked/000036.jpg
DELETED
Binary file (132 kB)
|
|
static/interpolation/stacked/000037.jpg
DELETED
Binary file (132 kB)
|
|
static/interpolation/stacked/000038.jpg
DELETED
Binary file (132 kB)
|
|
static/interpolation/stacked/000039.jpg
DELETED
Binary file (132 kB)
|
|
static/interpolation/stacked/000040.jpg
DELETED
Binary file (132 kB)
|
|
static/interpolation/stacked/000041.jpg
DELETED
Binary file (132 kB)
|
|