diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..988b6ed47aa64018e3c5c85ac99120fca2830024 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,38 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/fenceflower/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/mattcecsit/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/mattwrite/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..194e236cbd708160926c3513b4232285eb47b029
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+data/
+checkpoints/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..c950ef981a8d2e47599dd7acbbe1bf8de9a42aca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "croco"]
+ path = croco
+ url = https://github.com/naver/croco
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1c2fda565b94d0f2b94cb65ba7cca866e7a25478
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/dust3r.iml b/.idea/dust3r.iml
new file mode 100644
index 0000000000000000000000000000000000000000..bb34444daa649d3f067846b9968327bc1a7bbc92
--- /dev/null
+++ b/.idea/dust3r.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..435bbe22b00f9b9a1482166532075912909946a0
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e06983fc6f59398257503edc06ae534d7c029189
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/other.xml b/.idea/other.xml
new file mode 100644
index 0000000000000000000000000000000000000000..4c89e05cf52029dbb6c8a4bed1cf2c782727520f
--- /dev/null
+++ b/.idea/other.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000000000000000000000000000000000000..572b3e4829fcd634d2d1edd8e3a54a57390d26b5
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,653 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {
+ "associatedIndex": 6
+}
+
+
+
+
+
+ {
+ "keyToString": {
+ "Python.base_opt.executor": "Debug",
+ "Python.demo.executor": "Debug",
+ "Python.evaluate.executor": "Run",
+ "Python.gys_util.executor": "Run",
+ "Python.load_nvos.executor": "Debug",
+ "Python.prepare_prompts.executor": "Debug",
+ "Python.segment_eval_mask.executor": "Run",
+ "Python.test_vis.executor": "Run",
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "last_opened_file_path": "D:/XMU/mac/hujie/3D/DUSt3R/dust3r/data/nerf_llff_data(NVOS-all)/orchids",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "nodejs_package_manager_path": "npm",
+ "settings.editor.selected.configurable": "editor.preferences.fonts.default",
+ "vue.rearranger.settings.migration": "true"
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1713236486096
+
+
+ 1713236486096
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ file://$PROJECT_DIR$/demo.py
+ 352
+
+
+
+ file://$PROJECT_DIR$/demo.py
+ 350
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 49
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 347
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 39
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 307
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 301
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 64
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 107
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 106
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 204
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 75
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 179
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 166
+
+
+
+ file://$PROJECT_DIR$/croco/models/dpt_block.py
+ 444
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 71
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 184
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 58
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 163
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 185
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 184
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 90
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 94
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 56
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 89
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 95
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 533
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 171
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 172
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 167
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 166
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 170
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 37
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 128
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 131
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 146
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 136
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 140
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 143
+
+
+
+ file://$PROJECT_DIR$/dust3r/post_process.py
+ 16
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 291
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 292
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 370
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 270
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 269
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 179
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 195
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 176
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 197
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 30
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 377
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 140
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 139
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 167
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 173
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 171
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 162
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 162
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 153
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 237
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 239
+
+
+
+ file://$PROJECT_DIR$/gys_util.py
+ 109
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 20
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 21
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 94
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 337
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 350
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 505
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..31d92d26f1b665d0f06b23378ef1e1d558b648d7
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,13 @@
+DUSt3R
+Copyright 2024-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+naver/croco
+https://github.com/naver/croco/
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0
+
diff --git a/README.md b/README.md
index e57733f951abe6839b51d58ae91d8149a9c3c4dc..c8ba786340b23cb30849ef2ae0f130cfa56e6103 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
----
-title: Our3D
-emoji: 🏆
-colorFrom: blue
-colorTo: green
-sdk: gradio
-sdk_version: 4.42.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: 3D
+emoji: 🐨
+colorFrom: yellow
+colorTo: green
+sdk: gradio
+sdk_version: 4.42.0
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/SAM/__init__.py b/SAM/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65fe71691ee281310fb821441bc6d14285044322
--- /dev/null
+++ b/SAM/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .build_sam import (
+ build_sam,
+ build_sam_vit_h,
+ build_sam_vit_l,
+ build_sam_vit_b,
+ sam_model_registry,
+)
+from .predictor import SamPredictor
+from .automatic_mask_generator import SamAutomaticMaskGenerator
\ No newline at end of file
diff --git a/SAM/__pycache__/__init__.cpython-310.pyc b/SAM/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4232ca38eca64052b110ac7db53b31a96decb924
Binary files /dev/null and b/SAM/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc b/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee29a97176397f9a9a7eeaf2663f61cdd7c10909
Binary files /dev/null and b/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc differ
diff --git a/SAM/__pycache__/build_sam.cpython-310.pyc b/SAM/__pycache__/build_sam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fd9f031bf3367e8698126d84f6de4b0a1b565ee
Binary files /dev/null and b/SAM/__pycache__/build_sam.cpython-310.pyc differ
diff --git a/SAM/__pycache__/predictor.cpython-310.pyc b/SAM/__pycache__/predictor.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..334ea40f774d11ad495980f8f6081b3b9847c0a1
Binary files /dev/null and b/SAM/__pycache__/predictor.cpython-310.pyc differ
diff --git a/SAM/automatic_mask_generator.py b/SAM/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..da2f60cef62b7da80b02a835aca21eff328e65aa
--- /dev/null
+++ b/SAM/automatic_mask_generator.py
@@ -0,0 +1,372 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from .modeling import Sam
+from .predictor import SamPredictor
+from .utils.amg import (
+ MaskData,
+ area_from_rle,
+ batch_iterator,
+ batched_mask_to_box,
+ box_xyxy_to_xywh,
+ build_all_layer_point_grids,
+ calculate_stability_score,
+ coco_encode_rle,
+ generate_crop_boxes,
+ is_box_near_crop_edge,
+ mask_to_rle_pytorch,
+ remove_small_regions,
+ rle_to_mask,
+ uncrop_boxes_xyxy,
+ uncrop_masks,
+ uncrop_points,
+)
+
+
+class SamAutomaticMaskGenerator:
+ def __init__(
+ self,
+ model: Sam,
+ points_per_side: Optional[int] = 32,
+ points_per_batch: int = 64,
+ pred_iou_thresh: float = 0.88,
+ stability_score_thresh: float = 0.95,
+ stability_score_offset: float = 1.0,
+ box_nms_thresh: float = 0.7,
+ crop_n_layers: int = 0,
+ crop_nms_thresh: float = 0.7,
+ crop_overlap_ratio: float = 512 / 1500,
+ crop_n_points_downscale_factor: int = 1,
+ point_grids: Optional[List[np.ndarray]] = None,
+ min_mask_region_area: int = 0,
+ output_mode: str = "binary_mask",
+ ) -> None:
+ """
+ Using a SAM model, generates masks for the entire image.
+ Generates a grid of point prompts over the image, then filters
+ low quality and duplicate masks. The default settings are chosen
+ for SAM with a ViT-H backbone.
+
+ Arguments:
+ model (Sam): The SAM model to use for mask prediction.
+ points_per_side (int or None): The number of points to be sampled
+ along one side of the image. The total number of points is
+ points_per_side**2. If None, 'point_grids' must provide explicit
+ point sampling.
+ points_per_batch (int): Sets the number of points run simultaneously
+ by the model. Higher numbers may be faster but use more GPU memory.
+ pred_iou_thresh (float): A filtering threshold in [0,1], using the
+ model's predicted mask quality.
+ stability_score_thresh (float): A filtering threshold in [0,1], using
+ the stability of the mask under changes to the cutoff used to binarize
+ the model's mask predictions.
+ stability_score_offset (float): The amount to shift the cutoff when
+ calculated the stability score.
+ box_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks.
+ crop_n_layers (int): If >0, mask prediction will be run again on
+ crops of the image. Sets the number of layers to run, where each
+ layer has 2**i_layer number of image crops.
+ crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks between different crops.
+ crop_overlap_ratio (float): Sets the degree to which crops overlap.
+ In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (int): The number of points-per-side
+ sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ point_grids (list(np.ndarray) or None): A list over explicit grids
+ of points used for sampling, normalized to [0,1]. The nth grid in the
+ list is used in the nth crop layer. Exclusive with points_per_side.
+ min_mask_region_area (int): If >0, postprocessing will be applied
+ to remove disconnected regions and holes in masks with area smaller
+ than min_mask_region_area. Requires opencv.
+ output_mode (str): The form masks are returned in. Can be 'binary_mask',
+ 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+ For large resolutions, 'binary_mask' may consume large amounts of
+ memory.
+ """
+
+ assert (points_per_side is None) != (
+ point_grids is None
+ ), "Exactly one of points_per_side or point_grid must be provided."
+ if points_per_side is not None:
+ self.point_grids = build_all_layer_point_grids(
+ points_per_side,
+ crop_n_layers,
+ crop_n_points_downscale_factor,
+ )
+ elif point_grids is not None:
+ self.point_grids = point_grids
+ else:
+ raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+ assert output_mode in [
+ "binary_mask",
+ "uncompressed_rle",
+ "coco_rle",
+ ], f"Unknown output_mode {output_mode}."
+ if output_mode == "coco_rle":
+ from pycocotools import mask as mask_utils # type: ignore # noqa: F401
+
+ if min_mask_region_area > 0:
+ import cv2 # type: ignore # noqa: F401
+
+ self.predictor = SamPredictor(model)
+ self.points_per_batch = points_per_batch
+ self.pred_iou_thresh = pred_iou_thresh
+ self.stability_score_thresh = stability_score_thresh
+ self.stability_score_offset = stability_score_offset
+ self.box_nms_thresh = box_nms_thresh
+ self.crop_n_layers = crop_n_layers
+ self.crop_nms_thresh = crop_nms_thresh
+ self.crop_overlap_ratio = crop_overlap_ratio
+ self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+ self.min_mask_region_area = min_mask_region_area
+ self.output_mode = output_mode
+
+ @torch.no_grad()
+ def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+ """
+ Generates masks for the given image.
+
+ Arguments:
+ image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+ Returns:
+ list(dict(str, any)): A list over records for masks. Each record is
+ a dict containing the following keys:
+ segmentation (dict(str, any) or np.ndarray): The mask. If
+ output_mode='binary_mask', is an array of shape HW. Otherwise,
+ is a dictionary containing the RLE.
+ bbox (list(float)): The box around the mask, in XYWH format.
+ area (int): The area in pixels of the mask.
+ predicted_iou (float): The model's own prediction of the mask's
+ quality. This is filtered by the pred_iou_thresh parameter.
+ point_coords (list(list(float))): The point coordinates input
+ to the model to generate this mask.
+ stability_score (float): A measure of the mask's quality. This
+ is filtered on using the stability_score_thresh parameter.
+ crop_box (list(float)): The crop of the image used to generate
+ the mask, given in XYWH format.
+ """
+
+ # Generate masks
+ mask_data = self._generate_masks(image)
+
+ # Filter small disconnected regions and holes in masks
+ if self.min_mask_region_area > 0:
+ mask_data = self.postprocess_small_regions(
+ mask_data,
+ self.min_mask_region_area,
+ max(self.box_nms_thresh, self.crop_nms_thresh),
+ )
+
+ # Encode masks
+ if self.output_mode == "coco_rle":
+ mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
+ elif self.output_mode == "binary_mask":
+ mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+ else:
+ mask_data["segmentations"] = mask_data["rles"]
+
+ # Write mask records
+ curr_anns = []
+ for idx in range(len(mask_data["segmentations"])):
+ ann = {
+ "segmentation": mask_data["segmentations"][idx],
+ "area": area_from_rle(mask_data["rles"][idx]),
+ "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+ "predicted_iou": mask_data["iou_preds"][idx].item(),
+ "point_coords": [mask_data["points"][idx].tolist()],
+ "stability_score": mask_data["stability_score"][idx].item(),
+ "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+ }
+ curr_anns.append(ann)
+
+ return curr_anns
+
+ def _generate_masks(self, image: np.ndarray) -> MaskData:
+ orig_size = image.shape[:2]
+ crop_boxes, layer_idxs = generate_crop_boxes(
+ orig_size, self.crop_n_layers, self.crop_overlap_ratio
+ )
+
+ # Iterate over image crops
+ data = MaskData()
+ for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+ crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+ data.cat(crop_data)
+
+ # Remove duplicate masks between crops
+ if len(crop_boxes) > 1:
+ # Prefer masks from smaller crops
+ scores = 1 / box_area(data["crop_boxes"])
+ scores = scores.to(data["boxes"].device)
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ scores,
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.crop_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ data.to_numpy()
+ return data
+
+ def _process_crop(
+ self,
+ image: np.ndarray,
+ crop_box: List[int],
+ crop_layer_idx: int,
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ # Crop the image and calculate embeddings
+ x0, y0, x1, y1 = crop_box
+ cropped_im = image[y0:y1, x0:x1, :]
+ cropped_im_size = cropped_im.shape[:2]
+ self.predictor.set_image(cropped_im)
+
+ # Get points for this crop
+ points_scale = np.array(cropped_im_size)[None, ::-1]
+ points_for_image = self.point_grids[crop_layer_idx] * points_scale
+
+ # Generate masks for this crop in batches
+ data = MaskData()
+ for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+ batch_data = self._process_batch(points, cropped_im_size, crop_box, orig_size)
+ data.cat(batch_data)
+ del batch_data
+ self.predictor.reset_image()
+
+ # Remove duplicates within this crop.
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ data["iou_preds"],
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.box_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ # Return to the original image frame
+ data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+ data["points"] = uncrop_points(data["points"], crop_box)
+ data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+ return data
+
+ def _process_batch(
+ self,
+ points: np.ndarray,
+ im_size: Tuple[int, ...],
+ crop_box: List[int],
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ orig_h, orig_w = orig_size
+
+ # Run model on this batch
+ transformed_points = self.predictor.transform.apply_coords(points, im_size)
+ in_points = torch.as_tensor(transformed_points, device=self.predictor.device)
+ in_labels = torch.ones(in_points.shape[0], dtype=torch.int, device=in_points.device)
+ masks, iou_preds, _ = self.predictor.predict_torch(
+ in_points[:, None, :],
+ in_labels[:, None],
+ multimask_output=True,
+ return_logits=True,
+ )
+
+ # Serialize predictions and store in MaskData
+ data = MaskData(
+ masks=masks.flatten(0, 1),
+ iou_preds=iou_preds.flatten(0, 1),
+ points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+ )
+ del masks
+
+ # Filter by predicted IoU
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ # Calculate stability score
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.predictor.model.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+
+ # Threshold masks and calculate boxes
+ data["masks"] = data["masks"] > self.predictor.model.mask_threshold
+ data["boxes"] = batched_mask_to_box(data["masks"])
+
+ # Filter boxes that touch crop boundaries
+ keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+ if not torch.all(keep_mask):
+ data.filter(keep_mask)
+
+ # Compress to RLE
+ data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+ data["rles"] = mask_to_rle_pytorch(data["masks"])
+ del data["masks"]
+
+ return data
+
+ @staticmethod
+ def postprocess_small_regions(
+ mask_data: MaskData, min_area: int, nms_thresh: float
+ ) -> MaskData:
+ """
+ Removes small disconnected regions and holes in masks, then reruns
+ box NMS to remove any new duplicates.
+
+ Edits mask_data in place.
+
+ Requires open-cv as a dependency.
+ """
+ if len(mask_data["rles"]) == 0:
+ return mask_data
+
+ # Filter small disconnected regions and holes
+ new_masks = []
+ scores = []
+ for rle in mask_data["rles"]:
+ mask = rle_to_mask(rle)
+
+ mask, changed = remove_small_regions(mask, min_area, mode="holes")
+ unchanged = not changed
+ mask, changed = remove_small_regions(mask, min_area, mode="islands")
+ unchanged = unchanged and not changed
+
+ new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+ # Give score=0 to changed masks and score=1 to unchanged masks
+ # so NMS will prefer ones that didn't need postprocessing
+ scores.append(float(unchanged))
+
+ # Recalculate boxes and remove any new duplicates
+ masks = torch.cat(new_masks, dim=0)
+ boxes = batched_mask_to_box(masks)
+ keep_by_nms = batched_nms(
+ boxes.float(),
+ torch.as_tensor(scores),
+ torch.zeros_like(boxes[:, 0]), # categories
+ iou_threshold=nms_thresh,
+ )
+
+ # Only recalculate RLEs for masks that have changed
+ for i_mask in keep_by_nms:
+ if scores[i_mask] == 0.0:
+ mask_torch = masks[i_mask].unsqueeze(0)
+ mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+ mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
+ mask_data.filter(keep_by_nms)
+
+ return mask_data
\ No newline at end of file
diff --git a/SAM/build_sam.py b/SAM/build_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf07ae6373f722ad7f78ce515338b03d422a72a7
--- /dev/null
+++ b/SAM/build_sam.py
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from functools import partial
+
+from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
+
+
+def build_sam_vit_h(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1280,
+ encoder_depth=32,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[7, 15, 23, 31],
+ checkpoint=checkpoint,
+ )
+
+
+build_sam = build_sam_vit_h
+
+
+def build_sam_vit_l(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1024,
+ encoder_depth=24,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[5, 11, 17, 23],
+ checkpoint=checkpoint,
+ )
+
+
+def build_sam_vit_b(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=768,
+ encoder_depth=12,
+ encoder_num_heads=12,
+ encoder_global_attn_indexes=[2, 5, 8, 11],
+ checkpoint=checkpoint,
+ )
+
+
+sam_model_registry = {
+ "default": build_sam_vit_h,
+ "vit_h": build_sam_vit_h,
+ "vit_l": build_sam_vit_l,
+ "vit_b": build_sam_vit_b,
+}
+
+
+def _build_sam(
+ encoder_embed_dim,
+ encoder_depth,
+ encoder_num_heads,
+ encoder_global_attn_indexes,
+ checkpoint=None,
+):
+ prompt_embed_dim = 256
+ image_size = 1024
+ vit_patch_size = 16
+ image_embedding_size = image_size // vit_patch_size
+ sam = Sam(
+ image_encoder=ImageEncoderViT(
+ depth=encoder_depth,
+ embed_dim=encoder_embed_dim,
+ img_size=image_size,
+ mlp_ratio=4,
+ norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+ num_heads=encoder_num_heads,
+ patch_size=vit_patch_size,
+ qkv_bias=True,
+ use_rel_pos=True,
+ global_attn_indexes=encoder_global_attn_indexes,
+ window_size=14,
+ out_chans=prompt_embed_dim,
+ ),
+ prompt_encoder=PromptEncoder(
+ embed_dim=prompt_embed_dim,
+ image_embedding_size=(image_embedding_size, image_embedding_size),
+ input_image_size=(image_size, image_size),
+ mask_in_chans=16,
+ ),
+ mask_decoder=MaskDecoder(
+ num_multimask_outputs=3,
+ transformer=TwoWayTransformer(
+ depth=2,
+ embedding_dim=prompt_embed_dim,
+ mlp_dim=2048,
+ num_heads=8,
+ ),
+ transformer_dim=prompt_embed_dim,
+ iou_head_depth=3,
+ iou_head_hidden_dim=256,
+ ),
+ pixel_mean=[123.675, 116.28, 103.53],
+ pixel_std=[58.395, 57.12, 57.375],
+ )
+ sam.eval()
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ sam.load_state_dict(state_dict)
+ return sam
\ No newline at end of file
diff --git a/SAM/modeling/__init__.py b/SAM/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..392f6cbc118573ee18f946312c06af34716f9836
--- /dev/null
+++ b/SAM/modeling/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .sam import Sam
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .transformer import TwoWayTransformer
+from .image_encoder import ImageEncoderViT
\ No newline at end of file
diff --git a/SAM/modeling/__pycache__/__init__.cpython-310.pyc b/SAM/modeling/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aac45024e3f3fc300f0bf3bf822a0057a1dc13d7
Binary files /dev/null and b/SAM/modeling/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/common.cpython-310.pyc b/SAM/modeling/__pycache__/common.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47f8c8efb755572200fa19fe93143fdd76488ef0
Binary files /dev/null and b/SAM/modeling/__pycache__/common.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc b/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aeaf50923294930fbe14020a4c326c4b0a7a30d2
Binary files /dev/null and b/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc b/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7074bf0ef5a1c3d623aa4c57d24aad3e87905ae7
Binary files /dev/null and b/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc b/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13910b6023c9111c5427dcfbf16e6a2b1a3f7d22
Binary files /dev/null and b/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/sam.cpython-310.pyc b/SAM/modeling/__pycache__/sam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5045ad7c082a3c18a36d5e3674a97fde72da086
Binary files /dev/null and b/SAM/modeling/__pycache__/sam.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/transformer.cpython-310.pyc b/SAM/modeling/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b38537c70cf84caaf28ef49ef2b62317e154e6ec
Binary files /dev/null and b/SAM/modeling/__pycache__/transformer.cpython-310.pyc differ
diff --git a/SAM/modeling/common.py b/SAM/modeling/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..d67662c6a517be28bf3b8d037056a6e376cf7a7e
--- /dev/null
+++ b/SAM/modeling/common.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+from typing import Type
+
+
class MLPBlock(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> Linear."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        # Attribute names (lin1/lin2/act) are part of the checkpoint layout;
        # do not rename them.
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.lin1(x))
        return self.lin2(hidden)
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    """LayerNorm applied over the channel dimension of NCHW tensors."""

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize each spatial position across channels (dim=1),
        # then apply the learned per-channel affine transform.
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * normed + self.bias[:, None, None]
\ No newline at end of file
diff --git a/SAM/modeling/image_encoder.py b/SAM/modeling/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7030c033aa3147e9a30fb9a137cc7829e786993c
--- /dev/null
+++ b/SAM/modeling/image_encoder.py
@@ -0,0 +1,395 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Type
+
+from .common import LayerNorm2d, MLPBlock
+
+
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
    """ViT backbone mapping an image to a dense embedding, adapted from the
    ViTDet backbone in detectron2."""

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Absolute positional embedding sized for the pretraining grid.
            side = img_size // patch_size
            self.pos_embed = nn.Parameter(torch.zeros(1, side, side, embed_dim))

        grid = (img_size // patch_size, img_size // patch_size)
        # Blocks listed in global_attn_indexes attend globally (window_size=0);
        # all others use windowed attention.
        self.blocks = nn.ModuleList(
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=0 if idx in global_attn_indexes else window_size,
                input_size=grid,
            )
            for idx in range(depth)
        )

        # Project embed_dim -> out_chans with a 1x1 conv, then refine with a
        # 3x3 conv; both are norm'd and bias-free.
        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        tokens = self.patch_embed(x)
        if self.pos_embed is not None:
            tokens = tokens + self.pos_embed

        for block in self.blocks:
            tokens = block(tokens)

        # NHWC -> NCHW for the convolutional neck.
        return self.neck(tokens.permute(0, 3, 1, 2))
+
+
class Block(nn.Module):
    """Transformer block with residual connections, supporting windowed or
    global attention."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        # Windowed blocks size their relative-position tables to the window;
        # global blocks (window_size == 0) use the full input resolution.
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=(window_size, window_size) if window_size != 0 else input_size,
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)

        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.norm1(x)
        if self.window_size > 0:
            # Split into padded windows; remember the original H, W for the merge.
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = residual + x
        return x + self.mlp(self.norm2(x))
+
+
class Attention(nn.Module):
    """Multi-head self-attention with optional decomposed relative position
    embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # One table per axis, covering all 2*size-1 possible offsets.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, H, W, _ = x.shape
        tokens = H * W
        # (B, H, W, C) -> (3, B, nHeads, tokens, head_dim)
        qkv = self.qkv(x).reshape(B, tokens, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # Fold heads into the batch dimension: (B * nHeads, tokens, head_dim).
        q, k, v = qkv.reshape(3, B * self.num_heads, tokens, -1).unbind(0)

        attn = (q * self.scale) @ k.transpose(-2, -1)
        if self.use_rel_pos:
            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
        attn = attn.softmax(dim=-1)

        out = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        return self.proj(out)
+
+
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Split a [B, H, W, C] tensor into non-overlapping square windows, padding
    the bottom/right edges if H or W is not a multiple of window_size.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h or pad_w:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    grid = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)
+
+
def window_unpartition(
    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
    """
    Reassemble windows produced by window_partition into a [B, H, W, C]
    tensor, discarding any padding that was added.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    num_windows = (Hp // window_size) * (Wp // window_size)
    B = windows.shape[0] // num_windows
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        # Strip the bottom/right padding added by window_partition.
        x = x[:, :H, :W, :].contiguous()
    return x
+
+
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Look up relative positional embeddings for every (query, key) pair along
    one axis, resampling the table if it was trained at a different size.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] != max_rel_dist:
        # Linearly resample the table to the required number of offsets.
        resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Relative offset of each (query, key) pair, scaled by the shorter length
    # when q and k sizes differ, then shifted to be a non-negative index.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]
+
+
def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    # Per-axis embedding for every (query, key) offset: (q_h, k_h, C) / (q_w, k_w, C).
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    # Dot the queries with each axis table: (B, q_h, q_w, k_h) and (B, q_h, q_w, k_w).
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    # Broadcast-add the two axis terms over the 5-D (B, q_h, q_w, k_h, k_w)
    # view of attn, then flatten back to (B, q_h*q_w, k_h*k_w).
    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn
+
+
class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution."""

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embedded = self.proj(x)
        # Move channels last: (B, C, H, W) -> (B, H, W, C).
        return embedded.permute(0, 2, 3, 1)
\ No newline at end of file
diff --git a/SAM/modeling/mask_decoder.py b/SAM/modeling/mask_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6abb60143e4ea8e826a635f79b5d3a4df488add2
--- /dev/null
+++ b/SAM/modeling/mask_decoder.py
@@ -0,0 +1,192 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import List, Tuple, Type
+
+from .common import LayerNorm2d
+
+
class MaskDecoder(nn.Module):
    def __init__(
        self,
        *,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
    ) -> None:
        """
        Predicts masks given an image and prompt embeddings, using a
        transformer architecture.

        Arguments:
          transformer_dim (int): the channel dimension of the transformer
          transformer (nn.Module): the transformer used to predict masks
          num_multimask_outputs (int): the number of masks to predict
            when disambiguating masks
          activation (nn.Module): the type of activation to use when
            upscaling masks
          iou_head_depth (int): the depth of the MLP used to predict
            mask quality
          iou_head_hidden_dim (int): the hidden dimension of the MLP
            used to predict mask quality
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        # +1: one extra token for the single-mask (non-multimask) output slot.
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        # 4x spatial upscaling of the image embedding via two stride-2
        # transposed convolutions.
        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        # One hypernetwork MLP per mask token; each produces the weights used
        # to project the upscaled embedding into a mask.
        self.output_hypernetworks_mlps = nn.ModuleList(
            [
                MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
                for i in range(self.num_mask_tokens)
            ]
        )

        self.iou_prediction_head = MLP(
            transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        batch_ind_list: "Optional[List[int]]" = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Arguments:
          image_embeddings (torch.Tensor): the embeddings from the image encoder
          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
          multimask_output (bool): Whether to return multiple masks or a single
            mask.
          batch_ind_list (list(int) or None): when set, entry i gives the
            number of prompt instances belonging to image i, so each image
            embedding is repeated that many times; when None, every prompt
            shares the single image embedding.

        Returns:
          torch.Tensor: batched predicted masks
          torch.Tensor: batched predictions of mask quality
        """
        masks, iou_pred = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
            batch_ind_list=batch_ind_list,
        )

        # Select the correct mask or masks for output: index 0 is the
        # single-mask token; indices 1..N are the multimask tokens.
        if multimask_output:
            mask_slice = slice(1, None)
        else:
            mask_slice = slice(0, 1)
        masks = masks[:, mask_slice, :, :]
        iou_pred = iou_pred[:, mask_slice]

        # Prepare output
        return masks, iou_pred

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        batch_ind_list: "Optional[List[int]]",
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predicts masks. See 'forward' for more details."""
        # Concatenate output tokens
        if batch_ind_list is None:
            # Single shared image embedding, repeated once per prompt.
            output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
            output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

            # Expand per-image data in batch direction to be per-mask
            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
            src = src + dense_prompt_embeddings
            pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
            b, c, h, w = src.shape
        else:
            # Multi-image batch: image i's embedding is repeated
            # batch_ind_list[i] times so each instance gets its own copy.
            num_instances = int(sparse_prompt_embeddings.size(0))
            output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
            output_tokens = output_tokens.unsqueeze(0).expand(num_instances, -1, -1)
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

            # Expand per-image data in batch direction to be per-mask
            image_embeddings = torch.cat([image_embeddings[i].unsqueeze(0).repeat(n, 1, 1, 1) for i, n in enumerate(batch_ind_list)], dim=0)
            src = image_embeddings
            src = src + dense_prompt_embeddings
            # NOTE(review): assumes image_pe has batch size 1 here — confirm
            # against callers using batch_ind_list.
            pos_src = torch.repeat_interleave(image_pe, num_instances, dim=0)
            b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(src, pos_src, tokens)
        # Token 0 is the IoU token; tokens 1..num_mask_tokens are mask tokens.
        iou_token_out = hs[:, 0, :]
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in_list: List[torch.Tensor] = []
        for i in range(self.num_mask_tokens):
            hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]))
        hyper_in = torch.stack(hyper_in_list, dim=1)
        b, c, h, w = upscaled_embedding.shape
        # Each mask is the dot product of its hypernetwork output with the
        # upscaled embedding at every spatial location.
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)

        return masks, iou_pred
+
+
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
class MLP(nn.Module):
    """Simple multi-layer perceptron with ReLU between hidden layers."""

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        """
        Args:
            input_dim (int): size of the input features.
            hidden_dim (int): size of every hidden layer.
            output_dim (int): size of the output features.
            num_layers (int): total number of Linear layers.
            sigmoid_output (bool): if True, apply a sigmoid to the final output.
        """
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            # ReLU after every layer except the last.
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.sigmoid_output:
            # torch.sigmoid replaces the deprecated F.sigmoid.
            x = torch.sigmoid(x)
        return x
\ No newline at end of file
diff --git a/SAM/modeling/prompt_encoder.py b/SAM/modeling/prompt_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ea3a1d02e02232a7928ac235024d433a85be97
--- /dev/null
+++ b/SAM/modeling/prompt_encoder.py
@@ -0,0 +1,214 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch import nn
+
+from typing import Any, Optional, Tuple, Type
+
+from .common import LayerNorm2d
+
+
class PromptEncoder(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Encodes prompts for input to SAM's mask decoder.

        Arguments:
          embed_dim (int): The prompts' embedding dimension
          image_embedding_size (tuple(int, int)): The spatial size of the
            image embedding, as (H, W).
          input_image_size (int): The padded size of the image as input
            to the image encoder, as (H, W).
          mask_in_chans (int): The number of hidden channels used for
            encoding input masks.
          activation (nn.Module): The activation to use when encoding
            input masks.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        # Random-frequency positional encoding shared by point, box and
        # dense-grid encodings.
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
        # Indices: 0 = negative point, 1 = positive point,
        #          2 = first box corner, 3 = second box corner.
        point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)]
        self.point_embeddings = nn.ModuleList(point_embeddings)
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        # Input masks are expected at 4x the embedding resolution; the
        # downscaler reduces them back by 4x (two stride-2 convs).
        self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1])
        self.mask_downscaling = nn.Sequential(
            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans // 4),
            activation(),
            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        )
        # Learned embedding used when no mask prompt is supplied.
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Returns the positional encoding used to encode point prompts,
        applied to a dense set of points the shape of the image encoding.

        Returns:
          torch.Tensor: Positional encoding with shape
            1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        return self.pe_layer(self.image_embedding_size).unsqueeze(0)

    def _embed_points(
        self,
        points: torch.Tensor,
        labels: torch.Tensor,
        pad: bool,
    ) -> torch.Tensor:
        """Embeds point prompts."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            # When no box prompt accompanies the points, append a dummy
            # point with label -1 ("not a point").
            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, padding_point], dim=1)
            labels = torch.cat([labels, padding_label], dim=1)
        point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
        # Label -1: replace the positional encoding with the learned
        # "not a point" embedding; labels 0/1 add their type embedding.
        point_embedding[labels == -1] = 0.0
        point_embedding[labels == -1] += self.not_a_point_embed.weight
        point_embedding[labels == 0] += self.point_embeddings[0].weight
        point_embedding[labels == 1] += self.point_embeddings[1].weight
        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts."""
        boxes = boxes + 0.5  # Shift to center of pixel
        # Each box contributes two corner tokens with distinct embeddings.
        coords = boxes.reshape(-1, 2, 2)
        corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
        return corner_embedding

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embeds mask inputs."""
        mask_embedding = self.mask_downscaling(masks)
        return mask_embedding

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """
        Gets the batch size of the output given the batch size of the input prompts.
        """
        if points is not None:
            return points[0].shape[0]
        elif boxes is not None:
            return boxes.shape[0]
        elif masks is not None:
            return masks.shape[0]
        else:
            return 1

    def _get_device(self) -> torch.device:
        return self.point_embeddings[0].weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense
        embeddings.

        Arguments:
          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
            and labels to embed.
          boxes (torch.Tensor or none): boxes to embed
          masks (torch.Tensor or none): masks to embed

        Returns:
          torch.Tensor: sparse embeddings for the points and boxes, with shape
            BxNx(embed_dim), where N is determined by the number of input points
            and boxes.
          torch.Tensor: dense embeddings for the masks, in the shape
            Bx(embed_dim)x(embed_H)x(embed_W)
        """
        bs = self._get_batch_size(points, boxes, masks)
        # Start with an empty sparse embedding and concatenate whatever
        # prompt types were provided.
        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
        if points is not None:
            coords, labels = points
            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
        if boxes is not None:
            box_embeddings = self._embed_boxes(boxes)
            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)

        if masks is not None:
            dense_embeddings = self._embed_masks(masks)
        else:
            # No mask prompt: broadcast the learned no-mask embedding over
            # the full embedding grid.
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
            )

        return sparse_embeddings, dense_embeddings
+
+
class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        # Fixed random projection; registered as a buffer so it follows the
        # module across devices.
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points already normalized to [0, 1]."""
        # Map [0, 1] -> [-1, 1], project with the random matrix, then take
        # sin/cos features; output is d_1 x ... x d_n x C.
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for an h x w grid."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        # Normalized pixel-center coordinates in [0, 1].
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        encoded = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return encoded.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
\ No newline at end of file
diff --git a/SAM/modeling/sam.py b/SAM/modeling/sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b4655187c22d06d63f27139ae469f20d33b749
--- /dev/null
+++ b/SAM/modeling/sam.py
@@ -0,0 +1,187 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import Any, Dict, List, Tuple
+
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .image_encoder import ImageEncoderViT
+
class Sam(nn.Module):
    """
    SAM: predicts object masks from an image and input prompts by combining
    an image encoder, a prompt encoder, and a mask decoder.
    """

    # Logits above this threshold are treated as foreground when binarizing.
    mask_threshold: float = 0.0
    # Expected channel order of input images.
    image_format: str = "RGB"

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = [123.675, 116.28, 103.53],
        pixel_std: List[float] = [58.395, 57.12, 57.375],
    ) -> None:
        """
        SAM predicts object masks from an image and input prompts.

        Arguments:
          image_encoder (ImageEncoderViT): The backbone used to encode the
            image into image embeddings that allow for efficient mask prediction.
          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
            and encoded prompts.
          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
          pixel_std (list(float)): Std values for normalizing pixels in the input image.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        # Third positional arg persistent=False: the normalization constants
        # move with the module's device but are not saved in the state dict.
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self) -> Any:
        # Buffers follow .to()/.cuda(), so this tracks the module's device.
        return self.pixel_mean.device

    @torch.no_grad()
    def forward(
        self,
        batched_input: List[Dict[str, Any]],
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Predicts masks end-to-end from provided images and prompts.
        If prompts are not known in advance, using SamPredictor is
        recommended over calling the model directly.

        Arguments:
          batched_input (list(dict)): A list over input images, each a
            dictionary with the following keys. A prompt key can be
            excluded if it is not present.
              'image': The image as a torch tensor in 3xHxW format,
                already transformed for input to the model.
              'original_size': (tuple(int, int)) The original size of
                the image before transformation, as (H, W).
              'point_coords': (torch.Tensor) Batched point prompts for
                this image, with shape BxNx2. Already transformed to the
                input frame of the model.
              'point_labels': (torch.Tensor) Batched labels for point prompts,
                with shape BxN.
              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
                Already transformed to the input frame of the model.
              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
                in the form Bx1xHxW.
          multimask_output (bool): Whether the model should predict multiple
            disambiguating masks, or return a single mask.

        Returns:
          (list(dict)): A list over input images, where each element is
            a dictionary with the following keys.
              'masks': (torch.Tensor) Batched binary mask predictions,
                with shape BxCxHxW, where B is the number of input prompts,
                C is determined by multimask_output, and (H, W) is the
                original size of the image.
              'iou_predictions': (torch.Tensor) The model's predictions
                of mask quality, in shape BxC.
              'low_res_logits': (torch.Tensor) Low resolution logits with
                shape BxCxHxW, where H=W=256. Can be passed as mask input
                to subsequent iterations of prediction.
        """
        # Each image may carry a different number of prompts, so per-image
        # prompt embeddings are concatenated into one flat batch and
        # batch_ind_list records each image's prompt count for splitting later.
        spase_embed_list = []
        dense_embed_list = []
        batch_ind_list = []
        input_images_list = []
        for idx, image_record in enumerate(batched_input):
            input_images_list.append(self.preprocess(image_record["image"]))
            if "point_coords" in image_record:
                points = (image_record["point_coords"], image_record["point_labels"])
            else:
                points = None
            sparse_embed, dense_embed = self.prompt_encoder(
                points=points,
                boxes=image_record.get("boxes", None),
                masks=image_record.get("mask_inputs", None),
            )
            # Sparse and dense embeddings must agree on the prompt count.
            assert len(sparse_embed) == len(dense_embed)
            spase_embed_list.append(sparse_embed)
            dense_embed_list.append(dense_embed)
            batch_ind_list.append(len(sparse_embed))

        # Encode all images in a single batch, then decode every prompt at once.
        image_embeddings = self.image_encoder(torch.stack(input_images_list, dim=0))
        sparse_embed = torch.cat(spase_embed_list)
        dense_embed = torch.cat(dense_embed_list)
        low_res_masks, iou_predictions = self.mask_decoder(
            image_embeddings=image_embeddings,
            image_pe=self.prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embed,
            dense_prompt_embeddings=dense_embed,
            multimask_output=multimask_output,
            batch_ind_list=batch_ind_list,
        )
        # Split the flat decoder outputs back into per-image chunks.
        low_res_masks = torch.split(low_res_masks, batch_ind_list, dim=0)
        iou_predictions = torch.split(iou_predictions, batch_ind_list, dim=0)
        outputs = []
        for image_record, low_res_mask, iou_prediction in zip(batched_input, low_res_masks, iou_predictions):
            masks = self.postprocess_masks(
                low_res_mask,
                input_size=image_record["image"].shape[-2:],
                original_size=image_record["original_size"],
            )
            # Binarize the upscaled logits at the model threshold.
            masks = masks > self.mask_threshold
            outputs.append(
                {
                    "masks": masks,
                    "iou_predictions": iou_prediction,
                    "low_res_logits": low_res_mask,
                }
            )
        return outputs

    def postprocess_masks(
        self,
        masks: torch.Tensor,
        input_size: Tuple[int, ...],
        original_size: Tuple[int, ...],
    ) -> torch.Tensor:
        """
        Remove padding and upscale masks to the original image size.

        Arguments:
          masks (torch.Tensor): Batched masks from the mask_decoder,
            in BxCxHxW format.
          input_size (tuple(int, int)): The size of the image input to the
            model, in (H, W) format. Used to remove padding.
          original_size (tuple(int, int)): The original size of the image
            before resizing for input to the model, in (H, W) format.

        Returns:
          (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
            is given by original_size.
        """
        # Upscale to the padded square input resolution first...
        masks = F.interpolate(
            masks,
            (self.image_encoder.img_size, self.image_encoder.img_size),
            mode="bilinear",
            align_corners=False,
        )
        # ...then crop away the bottom/right padding added by preprocess()...
        masks = masks[..., : input_size[0], : input_size[1]]
        # ...and finally resize to the original image size.
        masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False)
        return masks

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
        # Normalize colors
        x = (x - self.pixel_mean) / self.pixel_std

        # Pad on the bottom/right so the result is img_size x img_size.
        h, w = x.shape[-2:]
        padh = self.image_encoder.img_size - h
        padw = self.image_encoder.img_size - w
        x = F.pad(x, (0, padw, 0, padh))
        return x
\ No newline at end of file
diff --git a/SAM/modeling/transformer.py b/SAM/modeling/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..33b8070d3c18d11a43fcca3e1b3d7d33dc9a1147
--- /dev/null
+++ b/SAM/modeling/transformer.py
@@ -0,0 +1,240 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import Tensor, nn
+
+import math
+from typing import Tuple, Type
+
+from .common import MLPBlock
+
+
class TwoWayTransformer(nn.Module):
    """
    A transformer decoder in which queries (point/prompt tokens) and keys
    (image tokens) attend to each other in both directions at every layer.
    """

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
          attention_downsample_rate (int): channel downsampling factor used
            inside the cross-attention layers
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    # The very first layer skips adding PE in self-attention.
                    skip_first_layer_pe=(i == 0),
                )
            )

        # One last token-to-image attention after the stacked blocks.
        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries
        queries = point_embedding
        keys = image_embedding

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=queries,
                keys=keys,
                # Positional embeddings are re-added at every layer.
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys
+
+
class TwoWayAttentionBlock(nn.Module):
    """
    One layer of the two-way transformer: sparse tokens and dense image
    tokens exchange information through self- and cross-attention.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          attention_downsample_rate (int): channel downsampling factor for
            the cross-attention layers
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """Run the four sub-layers; returns updated (queries, keys)."""
        # Self attention block
        if self.skip_first_layer_pe:
            # First layer: the queries ARE the PE, so don't add it again.
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        # (note q/k roles are swapped relative to the block above).
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys
+
+
class Attention(nn.Module):
    """
    Multi-head attention that can shrink the channel dimension of the
    internal q/k/v projections by an integer downsample_rate.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        # Project into the (possibly downsampled) internal dimension,
        # then back out to the embedding dimension.
        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        """B x N x C -> B x N_heads x N x C_per_head."""
        batch, tokens, channels = x.shape
        x = x.reshape(batch, tokens, num_heads, channels // num_heads)
        return x.transpose(1, 2)

    def _recombine_heads(self, x: Tensor) -> Tensor:
        """B x N_heads x N x C_per_head -> B x N x C."""
        batch, heads, tokens, per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(batch, tokens, heads * per_head)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Project the inputs and split channels across heads.
        q = self._separate_heads(self.q_proj(q), self.num_heads)
        k = self._separate_heads(self.k_proj(k), self.num_heads)
        v = self._separate_heads(self.v_proj(v), self.num_heads)

        # Scaled dot-product attention: B x N_heads x N_tokens x N_tokens.
        per_head = q.shape[-1]
        scores = q @ k.transpose(-2, -1) / math.sqrt(per_head)
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, merge heads, project back out.
        out = self._recombine_heads(weights @ v)
        return self.out_proj(out)
\ No newline at end of file
diff --git a/SAM/predictor.py b/SAM/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..877324aa85f4f22e44f22df6889188e544c9c412
--- /dev/null
+++ b/SAM/predictor.py
@@ -0,0 +1,269 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from .modeling import Sam
+
+from typing import Optional, Tuple
+
+from .utils.transforms import ResizeLongestSide
+
+
class SamPredictor:
    """
    Wraps a Sam model: computes an image embedding once via set_image, then
    serves repeated, cheap mask predictions for different prompts.
    """

    def __init__(
        self,
        sam_model: Sam,
    ) -> None:
        """
        Uses SAM to calculate the image embedding for an image, and then
        allow repeated, efficient mask prediction given prompts.

        Arguments:
          sam_model (Sam): The model to use for mask prediction.
        """
        super().__init__()
        self.model = sam_model
        self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
        self.reset_image()

    def set_image(
        self,
        image: np.ndarray,
        image_format: str = "RGB",
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method.

        Arguments:
          image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
          image_format (str): The color format of the image, in ['RGB', 'BGR'].
        """
        assert image_format in [
            "RGB",
            "BGR",
        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
        if image_format != self.model.image_format:
            # Flip channel order to match the model's expected format.
            image = image[..., ::-1]

        # Transform the image to the form expected by the model
        input_image = self.transform.apply_image(image)
        input_image_torch = torch.as_tensor(input_image, device=self.device)
        # HWC -> 1x3xHxW (contiguous for the encoder).
        input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]

        self.set_torch_image(input_image_torch, image.shape[:2])

    @torch.no_grad()
    def set_torch_image(
        self,
        transformed_image: torch.Tensor,
        original_image_size: Tuple[int, ...],
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method. Expects the input
        image to be already transformed to the format expected by the model.

        Arguments:
          transformed_image (torch.Tensor): The input image, with shape
            1x3xHxW, which has been transformed with ResizeLongestSide.
          original_image_size (tuple(int, int)): The size of the image
            before transformation, in (H, W) format.
        """
        assert (
            len(transformed_image.shape) == 4
            and transformed_image.shape[1] == 3
            and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size
        ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}."
        self.reset_image()

        self.original_size = original_image_size
        self.input_size = tuple(transformed_image.shape[-2:])
        input_image = self.model.preprocess(transformed_image)
        # The expensive encoder pass happens once here; predict() reuses it.
        self.features = self.model.image_encoder(input_image)
        self.is_image_set = True

    def predict(
        self,
        point_coords: Optional[np.ndarray] = None,
        point_labels: Optional[np.ndarray] = None,
        box: Optional[np.ndarray] = None,
        mask_input: Optional[np.ndarray] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict masks for the given input prompts, using the currently set image.

        Arguments:
          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (np.ndarray or None): A length N array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          box (np.ndarray or None): A length 4 array given a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form 1xHxW, where
            for SAM, H=W=256.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (np.ndarray): The output masks in CxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of length C containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape CxHxW, where C is the number
            of masks and H=W=256. These low resolution logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")

        # Transform input prompts
        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
        if point_coords is not None:
            assert (
                point_labels is not None
            ), "point_labels must be supplied if point_coords is supplied."
            # Map coordinates into the model's input frame, then add a
            # leading batch dimension of 1.
            point_coords = self.transform.apply_coords(point_coords, self.original_size)
            coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.device)
            labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
        if box is not None:
            box = self.transform.apply_boxes(box, self.original_size)
            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
            box_torch = box_torch[None, :]
        if mask_input is not None:
            mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.device)
            mask_input_torch = mask_input_torch[None, :, :, :]

        masks, iou_predictions, low_res_masks = self.predict_torch(
            coords_torch,
            labels_torch,
            box_torch,
            mask_input_torch,
            multimask_output,
            return_logits=return_logits,
        )

        # Strip the batch dimension of 1 and move results to numpy.
        masks_np = masks[0].detach().cpu().numpy()
        iou_predictions_np = iou_predictions[0].detach().cpu().numpy()
        low_res_masks_np = low_res_masks[0].detach().cpu().numpy()
        return masks_np, iou_predictions_np, low_res_masks_np

    @torch.no_grad()
    def predict_torch(
        self,
        point_coords: Optional[torch.Tensor],
        point_labels: Optional[torch.Tensor],
        boxes: Optional[torch.Tensor] = None,
        mask_input: Optional[torch.Tensor] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks for the given input prompts, using the currently set image.
        Input prompts are batched torch tensors and are expected to already be
        transformed to the input frame using ResizeLongestSide.

        Arguments:
          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (torch.Tensor or None): A BxN array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          boxes (np.ndarray or None): A Bx4 array given a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form Bx1xHxW, where
            for SAM, H=W=256. Masks returned by a previous iteration of the
            predict method do not need further transformation.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (torch.Tensor): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (torch.Tensor): An array of shape BxC containing the model's
            predictions for the quality of each mask.
          (torch.Tensor): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low res logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")

        if point_coords is not None:
            points = (point_coords, point_labels)
        else:
            points = None

        # Embed prompts
        sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
            points=points,
            boxes=boxes,
            masks=mask_input,
        )

        # Predict masks using the cached image embedding.
        low_res_masks, iou_predictions = self.model.mask_decoder(
            image_embeddings=self.features,
            image_pe=self.model.prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
        )

        # Upscale the masks to the original image resolution
        masks = self.model.postprocess_masks(low_res_masks, self.input_size, self.original_size)

        if not return_logits:
            masks = masks > self.model.mask_threshold

        return masks, iou_predictions, low_res_masks

    def get_image_embedding(self) -> torch.Tensor:
        """
        Returns the image embeddings for the currently set image, with
        shape 1xCxHxW, where C is the embedding dimension and (H,W) are
        the embedding spatial dimension of SAM (typically C=256, H=W=64).

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) to generate an embedding."
            )
        assert self.features is not None, "Features must exist if an image has been set."
        return self.features

    @property
    def device(self) -> torch.device:
        # Delegates to the wrapped model's device.
        return self.model.device

    def reset_image(self) -> None:
        """Resets the currently set image."""
        self.is_image_set = False
        # Cached image embedding from the encoder (set by set_torch_image).
        self.features = None
        self.orig_h = None
        self.orig_w = None
        self.input_h = None
        self.input_w = None
\ No newline at end of file
diff --git a/SAM/utils/__pycache__/amg.cpython-310.pyc b/SAM/utils/__pycache__/amg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71ad4b29f690a99c0740652956b53dcf23ef30f9
Binary files /dev/null and b/SAM/utils/__pycache__/amg.cpython-310.pyc differ
diff --git a/SAM/utils/__pycache__/transforms.cpython-310.pyc b/SAM/utils/__pycache__/transforms.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4635c30403ce8c463ce28eb8bc840471354f2ef2
Binary files /dev/null and b/SAM/utils/__pycache__/transforms.cpython-310.pyc differ
diff --git a/SAM/utils/amg.py b/SAM/utils/amg.py
new file mode 100644
index 0000000000000000000000000000000000000000..29deb156d2c6d1a350812db4aaca5c667d5f8960
--- /dev/null
+++ b/SAM/utils/amg.py
@@ -0,0 +1,346 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+
class MaskData:
    """
    Batched container for masks and their per-mask statistics, keyed by
    name. Supports row filtering, concatenation, and numpy conversion.
    """

    def __init__(self, **kwargs) -> None:
        for value in kwargs.values():
            assert isinstance(
                value, (list, np.ndarray, torch.Tensor)
            ), "MaskData only supports list, numpy arrays, and torch tensors."
        self._stats = dict(**kwargs)

    def __setitem__(self, key: str, item: Any) -> None:
        assert isinstance(
            item, (list, np.ndarray, torch.Tensor)
        ), "MaskData only supports list, numpy arrays, and torch tensors."
        self._stats[key] = item

    def __delitem__(self, key: str) -> None:
        del self._stats[key]

    def __getitem__(self, key: str) -> Any:
        return self._stats[key]

    def items(self) -> ItemsView[str, Any]:
        return self._stats.items()

    def filter(self, keep: torch.Tensor) -> None:
        """Keep only the rows selected by `keep` (bool mask or index tensor)."""
        for key in list(self._stats.keys()):
            value = self._stats[key]
            if value is None:
                self._stats[key] = None
            elif isinstance(value, torch.Tensor):
                self._stats[key] = value[torch.as_tensor(keep, device=value.device)]
            elif isinstance(value, np.ndarray):
                self._stats[key] = value[keep.detach().cpu().numpy()]
            elif isinstance(value, list) and keep.dtype == torch.bool:
                self._stats[key] = [item for flag, item in zip(keep, value) if flag]
            elif isinstance(value, list):
                self._stats[key] = [value[i] for i in keep]
            else:
                raise TypeError(f"MaskData key {key} has an unsupported type {type(value)}.")

    def cat(self, new_stats: "MaskData") -> None:
        """Append the rows of another MaskData, key by key."""
        for key, value in new_stats.items():
            if key not in self._stats or self._stats[key] is None:
                self._stats[key] = deepcopy(value)
            elif isinstance(value, torch.Tensor):
                self._stats[key] = torch.cat([self._stats[key], value], dim=0)
            elif isinstance(value, np.ndarray):
                self._stats[key] = np.concatenate([self._stats[key], value], axis=0)
            elif isinstance(value, list):
                self._stats[key] = self._stats[key] + deepcopy(value)
            else:
                raise TypeError(f"MaskData key {key} has an unsupported type {type(value)}.")

    def to_numpy(self) -> None:
        """Convert every torch tensor entry to a numpy array in place."""
        for key, value in self._stats.items():
            if isinstance(value, torch.Tensor):
                self._stats[key] = value.detach().cpu().numpy()
+
+
def is_box_near_crop_edge(
    boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
) -> torch.Tensor:
    """Flag boxes that touch the crop boundary but not the original image edge."""
    crop_edges = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
    image_edges = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
    global_boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
    at_crop = torch.isclose(global_boxes, crop_edges[None, :], atol=atol, rtol=0)
    at_image = torch.isclose(global_boxes, image_edges[None, :], atol=atol, rtol=0)
    # A side counts only when it hugs the crop edge without hugging the image edge.
    return (at_crop & ~at_image).any(dim=1)
+
+
def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
    """Convert a box from corner (x0, y0, x1, y1) format to (x, y, w, h)."""
    converted = deepcopy(box_xyxy)
    # Replace the far-corner coordinates with width/height on the copy.
    for corner_axis in (2, 3):
        converted[corner_axis] = converted[corner_axis] - converted[corner_axis - 2]
    return converted
+
+
def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
    """Yield aligned slices of size batch_size from each input sequence."""
    assert len(args) > 0 and all(
        len(a) == len(args[0]) for a in args
    ), "Batched iteration must have inputs of all the same size."
    total = len(args[0])
    # range() with a step naturally produces ceil(total / batch_size) batches.
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield [seq[start:stop] for seq in args]
+
+
def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
    """
    Encode a batch of binary masks as uncompressed RLE dicts in the
    column-major (Fortran) order expected by pycocotools.
    """
    b, h, w = tensor.shape
    # Column-major order: swap H and W before flattening each mask.
    flat = tensor.permute(0, 2, 1).flatten(1)

    # Positions where consecutive pixels differ mark run boundaries.
    transitions = (flat[:, 1:] ^ flat[:, :-1]).nonzero()

    encodings = []
    for mask_idx in range(b):
        boundary = transitions[transitions[:, 0] == mask_idx, 1]
        endpoints = torch.cat(
            [
                torch.tensor([0], dtype=boundary.dtype, device=boundary.device),
                boundary + 1,
                torch.tensor([h * w], dtype=boundary.dtype, device=boundary.device),
            ]
        )
        run_lengths = (endpoints[1:] - endpoints[:-1]).detach().cpu().tolist()
        # RLE starts by counting background; prepend an empty zero-run if
        # the mask begins with a foreground pixel.
        counts = ([0] if flat[mask_idx, 0] != 0 else []) + run_lengths
        encodings.append({"size": [h, w], "counts": counts})
    return encodings
+
+
def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
    """Decode an uncompressed (column-major) RLE into a binary HxW mask."""
    h, w = rle["size"]
    flat = np.empty(h * w, dtype=bool)
    pos = 0
    value = False  # runs alternate, starting with background
    for run in rle["counts"]:
        flat[pos : pos + run] = value
        pos += run
        value = not value
    # Data is column-major: reshape as (w, h), then transpose into C order.
    return flat.reshape(w, h).transpose()
+
+
def area_from_rle(rle: Dict[str, Any]) -> int:
    """Return the foreground area: odd-indexed runs count foreground pixels."""
    counts = rle["counts"]
    total = 0
    for i in range(1, len(counts), 2):
        total += counts[i]
    return total
+
+
def calculate_stability_score(
    masks: torch.Tensor, mask_threshold: float, threshold_offset: float
) -> torch.Tensor:
    """
    Stability score for a batch of masks: the IoU between binarizations
    of the mask logits at a high and a low threshold.
    """

    def _area(threshold: float) -> torch.Tensor:
        # One thresholded mask always contains the other, so IoU reduces to
        # an area ratio. The int16/int32 sums avoid the default int64 cast.
        return (
            (masks > threshold)
            .sum(-1, dtype=torch.int16)
            .sum(-1, dtype=torch.int32)
        )

    intersections = _area(mask_threshold + threshold_offset)
    unions = _area(mask_threshold - threshold_offset)
    return intersections / unions
+
+
def build_point_grid(n_per_side: int) -> np.ndarray:
    """Generate an n x n grid of (x, y) points evenly spaced in [0,1]x[0,1]."""
    # Half a cell of margin keeps points at cell centers, not edges.
    offset = 1 / (2 * n_per_side)
    coords = np.linspace(offset, 1 - offset, n_per_side)
    xs, ys = np.meshgrid(coords, coords)
    return np.stack([xs, ys], axis=-1).reshape(-1, 2)
+
+
def build_all_layer_point_grids(
    n_per_side: int, n_layers: int, scale_per_layer: int
) -> List[np.ndarray]:
    """Generate one point grid per crop layer, shrinking the side count per layer."""
    return [
        build_point_grid(int(n_per_side / (scale_per_layer**layer)))
        for layer in range(n_layers + 1)
    ]
+
+
+def generate_crop_boxes(
+ im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+ """
+ Generates a list of crop boxes of different sizes. Each layer
+ has (2**i)**2 boxes for the ith layer.
+ """
+ crop_boxes, layer_idxs = [], []
+ im_h, im_w = im_size
+ short_side = min(im_h, im_w)
+
+ # Original image
+ crop_boxes.append([0, 0, im_w, im_h])
+ layer_idxs.append(0)
+
+ def crop_len(orig_len, n_crops, overlap):
+ return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
+
+ for i_layer in range(n_layers):
+ n_crops_per_side = 2 ** (i_layer + 1)
+ overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
+
+ crop_w = crop_len(im_w, n_crops_per_side, overlap)
+ crop_h = crop_len(im_h, n_crops_per_side, overlap)
+
+ crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
+ crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
+
+ # Crops in XYXY format
+ for x0, y0 in product(crop_box_x0, crop_box_y0):
+ box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+ crop_boxes.append(box)
+ layer_idxs.append(i_layer + 1)
+
+ return crop_boxes, layer_idxs
+
+
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
+ # Check if boxes has a channel dimension
+ if len(boxes.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return boxes + offset
+
+
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0]], device=points.device)
+ # Check if points has a channel dimension
+ if len(points.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return points + offset
+
+
+def uncrop_masks(
+ masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+ x0, y0, x1, y1 = crop_box
+ if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
+ return masks
+ # Coordinate transform masks
+ pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
+ pad = (x0, pad_x - x0, y0, pad_y - y0)
+ return torch.nn.functional.pad(masks, pad, value=0)
+
+
+def remove_small_regions(
+ mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+ """
+ Removes small disconnected regions and holes in a mask. Returns the
+ mask and an indicator of if the mask has been modified.
+ """
+ import cv2 # type: ignore
+
+ assert mode in ["holes", "islands"]
+ correct_holes = mode == "holes"
+ working_mask = (correct_holes ^ mask).astype(np.uint8)
+ n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+ sizes = stats[:, -1][1:] # Row 0 is background label
+ small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+ if len(small_regions) == 0:
+ return mask, False
+ fill_labels = [0] + small_regions
+ if not correct_holes:
+ fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+ # If every region is below threshold, keep largest
+ if len(fill_labels) == 0:
+ fill_labels = [int(np.argmax(sizes)) + 1]
+ mask = np.isin(regions, fill_labels)
+ return mask, True
+
+
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+ from pycocotools import mask as mask_utils # type: ignore
+
+ h, w = uncompressed_rle["size"]
+ rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
+ rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
+ return rle
+
+
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
+ an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
+ """
+ # torch.max below raises an error on empty inputs, just skip in this case
+ if torch.numel(masks) == 0:
+ return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+ # Normalize shape to CxHxW
+ shape = masks.shape
+ h, w = shape[-2:]
+ if len(shape) > 2:
+ masks = masks.flatten(0, -3)
+ else:
+ masks = masks.unsqueeze(0)
+
+ # Get top and bottom edges
+ in_height, _ = torch.max(masks, dim=-1)
+ in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
+ bottom_edges, _ = torch.max(in_height_coords, dim=-1)
+ in_height_coords = in_height_coords + h * (~in_height)
+ top_edges, _ = torch.min(in_height_coords, dim=-1)
+
+ # Get left and right edges
+ in_width, _ = torch.max(masks, dim=-2)
+ in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
+ right_edges, _ = torch.max(in_width_coords, dim=-1)
+ in_width_coords = in_width_coords + w * (~in_width)
+ left_edges, _ = torch.min(in_width_coords, dim=-1)
+
+ # If the mask is empty the right edge will be to the left of the left edge.
+ # Replace these boxes with [0, 0, 0, 0]
+ empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
+ out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+ out = out * (~empty_filter).unsqueeze(-1)
+
+ # Return to original shape
+ if len(shape) > 2:
+ out = out.reshape(*shape[:-2], 4)
+ else:
+ out = out[0]
+
+ return out
\ No newline at end of file
diff --git a/SAM/utils/transforms.py b/SAM/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cdd0e02b123bd508180e3697a94e1af1a1aa570
--- /dev/null
+++ b/SAM/utils/transforms.py
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize, to_pil_image # type: ignore
+
+from copy import deepcopy
+from typing import Tuple
+
+
+class ResizeLongestSide:
+ """
+ Resizes images to the longest side 'target_length', as well as provides
+ methods for resizing coordinates and boxes. Provides methods for
+ transforming both numpy array and batched torch tensors.
+ """
+
+ def __init__(self, target_length: int) -> None:
+ self.target_length = target_length
+
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
+ """
+ Expects a numpy array with shape HxWxC in uint8 format.
+ """
+ target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
+ return np.array(resize(to_pil_image(image), target_size))
+
+ def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array of length 2 in the final dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).astype(float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array shape Bx4. Requires the original image size
+ in (H, W) format.
+ """
+ boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+ """
+ Expects batched images with shape BxCxHxW and float format. This
+ transformation may not exactly match apply_image. apply_image is
+ the transformation expected by the model.
+ """
+ # Expects an image in BCHW format. May not exactly match apply_image.
+ target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length)
+ return F.interpolate(
+ image, target_size, mode="bilinear", align_corners=False, antialias=True
+ )
+
+ def apply_coords_torch(
+ self, coords: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with length 2 in the last dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).to(torch.float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes_torch(
+ self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with shape Bx4. Requires the original image
+ size in (H, W) format.
+ """
+ boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ @staticmethod
+ def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
+ """
+ Compute the output size given input size and target long side length.
+ """
+ scale = long_side_length * 1.0 / max(oldh, oldw)
+ newh, neww = oldh * scale, oldw * scale
+ neww = int(neww + 0.5)
+ newh = int(newh + 0.5)
+ return (newh, neww)
\ No newline at end of file
diff --git a/__pycache__/evaluate.cpython-310.pyc b/__pycache__/evaluate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1913b754054bf7fbe9af2c83edd5ae72ddbef55
Binary files /dev/null and b/__pycache__/evaluate.cpython-310.pyc differ
diff --git a/__pycache__/load_nvos.cpython-310.pyc b/__pycache__/load_nvos.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1135c654028930ec45969849db96c7796a02500
Binary files /dev/null and b/__pycache__/load_nvos.cpython-310.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1dcd6ad80fdfc29e27eea6e64be08b4fec23215
--- /dev/null
+++ b/app.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# gradio demo
+# --------------------------------------------------------
+import argparse
+import gradio
+import os
+import torch
+import numpy as np
+import tempfile
+import functools
+import trimesh
+import copy
+from scipy.spatial.transform import Rotation
+
+from dust3r.inference import inference, load_model
+from dust3r.image_pairs import make_pairs
+from dust3r.utils.image import load_images, rgb
+from dust3r.utils.device import to_numpy
+from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
+from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
+
+import matplotlib.pyplot as plt
+plt.ion()
+
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+batch_size = 1
+
+def show_mask(mask, ax, random_color=False):
+ if random_color:
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+ else:
+ color = np.array([30/255, 144/255, 255/255, 0.6])
+ h, w = mask.shape[-2:]
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+ ax.imshow(mask_image)
+
+def show_points(coords, labels, ax, marker_size=375):
+ pos_points = coords[labels==1]
+ neg_points = coords[labels==0]
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+
+def show_box(box, ax):
+ x0, y0 = box[0], box[1]
+ w, h = box[2] - box[0], box[3] - box[1]
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
+
+from SAM import SamPredictor
+from SAM.build_sam import sam_model_registry
+sam_checkpoint = "checkpoints/sam_vit_b_01ec64.pth"
+model_type = "vit_b"
+
+sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
+sam.to(device='cuda')
+predictor = SamPredictor(sam)
+
+def get_args_parser():
+ parser = argparse.ArgumentParser()
+ parser_url = parser.add_mutually_exclusive_group()
+ parser_url.add_argument("--local_network", action='store_true', default=False,
+ help="make app accessible on local network: address will be set to 0.0.0.0")
+ parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 127.0.0.1")
+ parser.add_argument("--image_size", type=int, default=512, choices=[512, 224], help="image size")
+ parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). "
+ "If None, will search for an available port starting at 7860."),
+ default=None)
+ parser.add_argument("--weights", type=str, required=True, help="path to the model weights")
+ parser.add_argument("--device", type=str, default='cuda', help="pytorch device")
+ parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir")
+ return parser
+
+
+def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
+ cam_color=None, as_pointcloud=False, transparent_cams=False):
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+ pts3d = to_numpy(pts3d)
+ imgs = to_numpy(imgs)
+ focals = to_numpy(focals)
+ cams2world = to_numpy(cams2world)
+
+ scene = trimesh.Scene()
+
+ # full pointcloud
+ if as_pointcloud:
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+ scene.add_geometry(pct)
+ else:
+ meshes = []
+ for i in range(len(imgs)):
+ meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
+ scene.add_geometry(mesh)
+
+ # add each camera
+ for i, pose_c2w in enumerate(cams2world):
+ if isinstance(cam_color, list):
+ camera_edge_color = cam_color[i]
+ else:
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
+ None if transparent_cams else imgs[i], focals[i],
+ imsize=imgs[i].shape[1::-1], screen_width=cam_size)
+
+ rot = np.eye(4)
+ rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+ scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+ outfile = os.path.join(outdir, 'scene.glb')
+ print('(exporting 3D scene to', outfile, ')')
+ scene.export(file_obj=outfile)
+ return outfile
+
+
+def get_3D_model_from_scene(outdir, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
+ clean_depth=False, transparent_cams=False, cam_size=0.05):
+ """
+ extract 3D_model (glb file) from a reconstructed scene
+ """
+ if scene is None:
+ return None
+ # post processes
+ if clean_depth:
+ scene = scene.clean_pointcloud()
+ if mask_sky:
+ scene = scene.mask_sky()
+
+ # get optimized values from scene
+ rgbimg = scene.imgs
+ # print('SAM step...')
+ # predictor.set_image((rgbimg[0] * 255).astype(np.uint8))
+ # h,w,c = rgbimg[0].shape
+ # input_point = np.array([
+ # [int(w/2), int(h/2)],
+ # [int(w/2), int(h/2)-20]
+ # ])
+ # input_label = np.array([1,1])
+ # masks1, scores, logits = predictor.predict(
+ # point_coords=input_point,
+ # point_labels=input_label,
+ # multimask_output=False,
+ # )
+ # fig, ax = plt.subplots(4, 2, figsize=(20, 20))
+ # show_mask(masks1[0], ax[0][0], random_color=True)
+ # show_points(input_point, input_label, ax[0][0])
+ # ax[0][1].imshow(rgbimg[0])
+
+ # predictor.set_image((rgbimg[1] * 255).astype(np.uint8))
+ # h,w,c = rgbimg[1].shape
+ # input_point = np.array([
+ # [int(w/2), int(h/2)],
+ # [int(w/2), int(h/2)-20]
+ # ])
+ # input_label = np.array([1,1])
+ # masks2, scores, logits = predictor.predict(
+ # point_coords=input_point,
+ # point_labels=input_label,
+ # multimask_output=False,
+ # )
+ focals = scene.get_focals().cpu()
+ cams2world = scene.get_im_poses().cpu()
+ # 3D pointcloud from depthmap, poses and intrinsics
+ pts3d = to_numpy(scene.get_pts3d())
+ scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
+ msk = to_numpy(scene.get_masks())
+ # ax[1][0].imshow(msk[0])
+ # msk[0] = msk[0] & masks1[0]
+ # ax[1][1].imshow(msk[0])
+ # ax[2][1].imshow(rgbimg[1])
+ # show_mask(masks2[0], ax[2][0], random_color=True)
+ # show_points(input_point, input_label, ax[2][0])
+ # ax[3][0].imshow(msk[1])
+ # # msk[1] = msk[1] & masks2[0]
+ # ax[3][1].imshow(msk[1])
+ # plt.savefig("rgb.png")
+ # import pdb
+ # pdb.set_trace()
+ return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
+ transparent_cams=transparent_cams, cam_size=cam_size)
+
+
+def get_reconstructed_scene(outdir, model, device, image_size, filelist, schedule, niter, min_conf_thr,
+ as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid):
+ """
+ from a list of images, run dust3r inference, global aligner.
+ then run get_3D_model_from_scene
+ """
+ imgs = load_images(filelist, size=image_size)
+ if len(imgs) == 1:
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
+ imgs[1]['idx'] = 1
+ if scenegraph_type == "swin":
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
+ elif scenegraph_type == "oneref":
+ scenegraph_type = scenegraph_type + "-" + str(refid)
+
+ pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
+ output = inference(pairs, model, device, batch_size=batch_size)
+
+ mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
+ scene = global_aligner(output, device=device, mode=mode)
+ lr = 0.01
+
+ if mode == GlobalAlignerMode.PointCloudOptimizer:
+ loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
+
+ outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size)
+
+ # also return rgb, depth and confidence imgs
+ # depth is normalized with the max value for all images
+ # we apply the jet colormap on the confidence maps
+ rgbimg = scene.imgs
+ depths = to_numpy(scene.get_depthmaps())
+ confs = to_numpy([c for c in scene.im_conf])
+ cmap = plt.get_cmap('jet')
+ depths_max = max([d.max() for d in depths])
+ depths = [d/depths_max for d in depths]
+ confs_max = max([d.max() for d in confs])
+ confs = [cmap(d/confs_max) for d in confs]
+
+ imgs = []
+ for i in range(len(rgbimg)):
+ imgs.append(rgbimg[i])
+ imgs.append(rgb(depths[i]))
+ imgs.append(rgb(confs[i]))
+
+ return scene, outfile, imgs
+
+
+def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
+ num_files = len(inputfiles) if inputfiles is not None else 1
+ max_winsize = max(1, (num_files - 1)//2)
+ if scenegraph_type == "swin":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=True)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=False)
+ elif scenegraph_type == "oneref":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=True)
+ else:
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=False)
+ return winsize, refid
+
+
+def main_demo(tmpdirname, model, device, image_size, server_name, server_port):
+ recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, image_size)
+ model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname)
+ with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="DUSt3R Demo") as demo:
+ # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
+ scene = gradio.State(None)
+ gradio.HTML('
DUSt3R Demo
')
+ with gradio.Column():
+ inputfiles = gradio.File(file_count="multiple")
+ with gradio.Row():
+ schedule = gradio.Dropdown(["linear", "cosine"],
+ value='linear', label="schedule", info="For global alignment!")
+ niter = gradio.Number(value=300, precision=0, minimum=0, maximum=5000,
+ label="num_iterations", info="For global alignment!")
+ scenegraph_type = gradio.Dropdown(["complete", "swin", "oneref"],
+ value='complete', label="Scenegraph",
+ info="Define how to make pairs",
+ interactive=True)
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
+ minimum=1, maximum=1, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
+
+ run_btn = gradio.Button("Run")
+
+ with gradio.Row():
+ # adjust the confidence threshold
+ min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1)
+ # adjust the camera size in the output pointcloud
+ cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001)
+ with gradio.Row():
+ as_pointcloud = gradio.Checkbox(value=False, label="As pointcloud")
+ # two post process implemented
+ mask_sky = gradio.Checkbox(value=False, label="Mask sky")
+ clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
+ transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
+
+ outmodel = gradio.Model3D()
+ outgallery = gradio.Gallery(label='rgb,depth,confidence', columns=3, height="100%")
+
+ # events
+ scenegraph_type.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ inputfiles.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ run_btn.click(fn=recon_fun,
+ inputs=[inputfiles, schedule, niter, min_conf_thr, as_pointcloud,
+ mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid],
+ outputs=[scene, outmodel, outgallery])
+ min_conf_thr.release(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ cam_size.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ as_pointcloud.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ mask_sky.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ clean_depth.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ transparent_cams.change(model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ demo.launch(share=False, server_name=server_name, server_port=server_port)
+
+
+if __name__ == '__main__':
+ parser = get_args_parser()
+ args = parser.parse_args()
+
+ if args.tmp_dir is not None:
+ tmp_path = args.tmp_dir
+ os.makedirs(tmp_path, exist_ok=True)
+ tempfile.tempdir = tmp_path
+
+ if args.server_name is not None:
+ server_name = args.server_name
+ else:
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
+
+ model = load_model(args.weights, args.device)
+ # dust3r will write the 3D model inside tmpdirname
+ with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
+ print('Outputing stuff in', tmpdirname)
+ main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port)
diff --git a/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth b/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90014c0f6bab509e081b52712cc31e1f191d2a4a
--- /dev/null
+++ b/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e8bbf0c4d1d6007f5343f3f45814b956ddc5bbb4d00cb66beaf73afe5c53b34
+size 2285019929
diff --git a/checkpoints/sam_vit_b_01ec64.pth b/checkpoints/sam_vit_b_01ec64.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ab7d111e57bd052a76fe669986560e3555e9c8f6
--- /dev/null
+++ b/checkpoints/sam_vit_b_01ec64.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912
+size 375042383
diff --git a/configs/default.py b/configs/default.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb15c0add64e922053afbcaa15a7d37755130d83
--- /dev/null
+++ b/configs/default.py
@@ -0,0 +1,119 @@
+from copy import deepcopy
+
+expname = None # experiment name
+basedir = './logs/' # where to store ckpts and logs
+
+''' Template of data options
+'''
+data = dict(
+ datadir=None, # path to dataset root folder
+ dataset_type=None, # blender | nsvf | blendedmvs | tankstemple | deepvoxels | co3d
+ inverse_y=False, # intrinsic mode (to support blendedmvs, nsvf, tankstemple)
+ flip_x=False, # to support co3d
+ flip_y=False, # to support co3d
+ annot_path='', # to support co3d
+ split_path='', # to support co3d
+ sequence_name='', # to support co3d
+# load2gpu_on_the_fly=False, # do not load all images into gpu (to save gpu memory)
+ load2gpu_on_the_fly=True, # do not load all images into gpu (to save gpu memory)
+ testskip=5, # subsample testset to preview results
+ white_bkgd=True, # use white background (note that some datasets don't provide alpha and use a blended bg color)
+ rand_bkgd=False, # use random background during training
+ half_res=False, # [TODO]
+ bd_factor=.75,
+ movie_render_kwargs=dict(),
+
+ # Below are forward-facing llff specific settings.
+ ndc=False, # use ndc coordinate (only for forward-facing; not support yet)
+ spherify=False, # inward-facing
+ factor=4, # [TODO]
+ width=None, # enforce image width
+ height=None, # enforce image height
+ llffhold=8, # testsplit
+ load_depths=False, # load depth
+
+ # Below are unbounded inward-facing specific settings.
+ unbounded_inward=False,
+ unbounded_inner_r=1.0,
+)
+
+''' Template of training options
+'''
+coarse_train = dict(
+ N_iters=5000, # number of optimization steps
+ N_rand=8192, # batch size (number of random rays per optimization step)
+ #N_rand=1024, # batch size (number of random rays per optimization step)
+ lrate_density=1e-1, # lr of density voxel grid
+ lrate_k0=1e-1, # lr of color/feature voxel grid
+ lrate_rgbnet=1e-3, # lr of the mlp to predict view-dependent color
+ lrate_decay=20, # lr decay by 0.1 after every lrate_decay*1000 steps
+ pervoxel_lr=True, # view-count-based lr
+ pervoxel_lr_downrate=1, # downsampled image for computing view-count-based lr
+ ray_sampler='random', # ray sampling strategies
+ weight_main=1.0, # weight of photometric loss
+ weight_entropy_last=0.01, # weight of background entropy loss
+ weight_nearclip=0,
+ weight_distortion=0,
+ weight_rgbper=0.1, # weight of per-point rgb loss
+ tv_every=1, # count total variation loss every tv_every step
+ tv_after=0, # count total variation loss from tv_from step
+ tv_before=0, # count total variation before the given number of iterations
+ tv_dense_before=0, # count total variation densely before the given number of iterations
+ weight_tv_density=0.0, # weight of total variation loss of density voxel grid
+ weight_tv_k0=0.0, # weight of total variation loss of color/feature voxel grid
+ pg_scale=[], # checkpoints for progressive scaling
+ decay_after_scale=1.0, # decay act_shift after scaling
+ skip_zero_grad_fields=[], # the variable name to skip optimizing parameters w/ zero grad in each iteration
+ maskout_lt_nviews=0,
+)
+
+fine_train = deepcopy(coarse_train)
+fine_train.update(dict(
+ N_iters=20000,
+ pervoxel_lr=False,
+ ray_sampler='flatten',
+ weight_entropy_last=0.001,
+ weight_rgbper=0.01,
+ pg_scale=[1000, 2000, 3000, 4000],
+ skip_zero_grad_fields=['density', 'k0'],
+))
+
+''' Template of model and rendering options
+'''
+coarse_model_and_render = dict(
+ num_voxels=1024000, # expected number of voxel
+ num_voxels_base=1024000, # to rescale delta distance
+ density_type='DenseGrid', # DenseGrid, TensoRFGrid
+ k0_type='TensoRFGrid', # DenseGrid, TensoRFGrid
+ density_config=dict(),
+ k0_config=dict(n_comp=48),
+ mpi_depth=128, # the number of planes in Multiplane Image (work when ndc=True)
+ nearest=False, # nearest interpolation
+ pre_act_density=False, # pre-activated trilinear interpolation
+ in_act_density=False, # in-activated trilinear interpolation
+ bbox_thres=1e-3, # threshold to determine known free-space in the fine stage
+ mask_cache_thres=1e-3, # threshold to determine a tighten BBox in the fine stage
+ rgbnet_dim=0, # feature voxel grid dim
+ rgbnet_full_implicit=False, # let the colors MLP ignore feature voxel grid
+ rgbnet_direct=True, # set to False to treat the first 3 dim of feature voxel grid as diffuse rgb
+ rgbnet_depth=3, # depth of the colors MLP (there are rgbnet_depth-1 intermediate features)
+ rgbnet_width=128, # width of the colors MLP
+ alpha_init=1e-6, # set the alpha values everywhere at the begin of training
+ fast_color_thres=1e-7, # threshold of alpha value to skip the fine stage sampled point
+ maskout_near_cam_vox=True, # maskout grid points that between cameras and their near planes
+ world_bound_scale=1, # rescale the BBox enclosing the scene
+ stepsize=0.5, # sampling stepsize in volume rendering
+)
+
+fine_model_and_render = deepcopy(coarse_model_and_render)
+fine_model_and_render.update(dict(
+ num_voxels=160**3,
+ num_voxels_base=160**3,
+ rgbnet_dim=12,
+ alpha_init=1e-2,
+ fast_color_thres=1e-4,
+ maskout_near_cam_vox=False,
+ world_bound_scale=1.05,
+))
+
+del deepcopy
diff --git a/configs/lerf/book_store.py b/configs/lerf/book_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..999f66d56eac44c535e7eebe72955cc4b617852f
--- /dev/null
+++ b/configs/lerf/book_store.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_book_store'
+
+data = dict(
+ datadir='./data/lerf_data/book_store',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.5, # positive right
+ shift_y=0.5, # negative down
+ shift_z=1,
+ scale_r=0,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/bouquet.py b/configs/lerf/bouquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..761b6653a4d1db9ffd565891e047d1ef7576e011
--- /dev/null
+++ b/configs/lerf/bouquet.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_bouquet'
+
+data = dict(
+ datadir='./data/lerf_data/bouquet',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/donuts.py b/configs/lerf/donuts.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ee922f9043cb57c63ec3cd97d5d0a2bb618402c
--- /dev/null
+++ b/configs/lerf/donuts.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_donuts'
+
+data = dict(
+ datadir='./data/lerf_data/donuts',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=-0.2,
+ shift_y=0.2,
+ shift_z=0.1,
+ scale_r=1.3,
+ pitch_deg=60,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/dozer_nerfgun_waldo.py b/configs/lerf/dozer_nerfgun_waldo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa6888f66c5eb3848b98075f0f6ab132b300d64d
--- /dev/null
+++ b/configs/lerf/dozer_nerfgun_waldo.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_dozer_nerfgun_waldo'
+
+data = dict(
+ datadir='./data/lerf_data/dozer_nerfgun_waldo',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/espresso.py b/configs/lerf/espresso.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af0f7b84a68874195bfd08af6bba547cdaf61f3
--- /dev/null
+++ b/configs/lerf/espresso.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_espresso'
+
+data = dict(
+ datadir='./data/lerf_data/espresso',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/figurines.py b/configs/lerf/figurines.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b76d6cdfeac1116c1b1bdff89d63d431fd33941
--- /dev/null
+++ b/configs/lerf/figurines.py
@@ -0,0 +1,15 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_figurines'
+
+data = dict(
+ datadir='./data/lerf_data/figurines',
+ factor=2, # 497 * 369
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=55,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/lerf_default.py b/configs/lerf/lerf_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..64859cb80f6e34b7d3b74ed78878932d288f6ab0
--- /dev/null
+++ b/configs/lerf/lerf_default.py
@@ -0,0 +1,52 @@
+# copy from nerf unbounded
+_base_ = '../default.py'
+
+basedir = './logs/lerf'
+
+data = dict(
+ dataset_type='lerf',
+ spherify=False,
+ factor=2,
+ white_bkgd=True,
+ rand_bkgd=True,
+ inverse_y=False, # llff format
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/lerf/room.py b/configs/lerf/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/lerf/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/lerf/seg_lerf/book_store.py b/configs/lerf/seg_lerf/book_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..999f66d56eac44c535e7eebe72955cc4b617852f
--- /dev/null
+++ b/configs/lerf/seg_lerf/book_store.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_book_store'
+
+data = dict(
+ datadir='./data/lerf_data/book_store',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.5, # positive right
+ shift_y=0.5, # negative down
+ shift_z=1,
+ scale_r=0,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/bouquet.py b/configs/lerf/seg_lerf/bouquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..761b6653a4d1db9ffd565891e047d1ef7576e011
--- /dev/null
+++ b/configs/lerf/seg_lerf/bouquet.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_bouquet'
+
+data = dict(
+ datadir='./data/lerf_data/bouquet',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/donuts.py b/configs/lerf/seg_lerf/donuts.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ccadb75ddafac0dec1838176e28082261be1b51
--- /dev/null
+++ b/configs/lerf/seg_lerf/donuts.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_donuts'
+
+data = dict(
+ datadir='./data/lerf_data/donuts',
+ factor=1, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=-0.2,
+ shift_y=0.2,
+ shift_z=0.1,
+ scale_r=1.3,
+ pitch_deg=60,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py b/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa6888f66c5eb3848b98075f0f6ab132b300d64d
--- /dev/null
+++ b/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_dozer_nerfgun_waldo'
+
+data = dict(
+ datadir='./data/lerf_data/dozer_nerfgun_waldo',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/espresso.py b/configs/lerf/seg_lerf/espresso.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af0f7b84a68874195bfd08af6bba547cdaf61f3
--- /dev/null
+++ b/configs/lerf/seg_lerf/espresso.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_espresso'
+
+data = dict(
+ datadir='./data/lerf_data/espresso',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/figurines.py b/configs/lerf/seg_lerf/figurines.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b76d6cdfeac1116c1b1bdff89d63d431fd33941
--- /dev/null
+++ b/configs/lerf/seg_lerf/figurines.py
@@ -0,0 +1,15 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_figurines'
+
+data = dict(
+ datadir='./data/lerf_data/figurines',
+ factor=2, # 497 * 369
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=55,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/lerf_default.py b/configs/lerf/seg_lerf/lerf_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba20465281f0f06c12a0f94436b51055b2d81ecf
--- /dev/null
+++ b/configs/lerf/seg_lerf/lerf_default.py
@@ -0,0 +1,52 @@
+# copy from nerf unbounded
+_base_ = '../../seg_default.py'
+
+basedir = './logs/lerf'
+
+data = dict(
+ dataset_type='lerf',
+ spherify=False,
+ factor=2,
+ white_bkgd=True,
+ rand_bkgd=True,
+ inverse_y=False, # llff format
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/lerf/seg_lerf/room.py b/configs/lerf/seg_lerf/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/lerf/seg_lerf/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/llff/airplants.py b/configs/llff/airplants.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a2c01555963c058cb2905cc0929b534df53116
--- /dev/null
+++ b/configs/llff/airplants.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'airplants'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/airplants',
+)
diff --git a/configs/llff/apeskeleton.py b/configs/llff/apeskeleton.py
new file mode 100644
index 0000000000000000000000000000000000000000..d94364261872b6aaaff27b4549fa4a4279d3fc76
--- /dev/null
+++ b/configs/llff/apeskeleton.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'apeskeleton'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/apeskeleton',
+)
diff --git a/configs/llff/bikes.py b/configs/llff/bikes.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a1c6f0ede20193edf593a76db61e953b56efc79
--- /dev/null
+++ b/configs/llff/bikes.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'bikes'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/bikes',
+)
diff --git a/configs/llff/butcher.py b/configs/llff/butcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7f3fda298aa01133399ff37ef304be21e60d208
--- /dev/null
+++ b/configs/llff/butcher.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'butcher'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_butcher',
+)
diff --git a/configs/llff/chesstable.py b/configs/llff/chesstable.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bb76b05e61d8078458809a27791580d58b2b854
--- /dev/null
+++ b/configs/llff/chesstable.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'chesstable'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data2_chesstable',
+)
diff --git a/configs/llff/colorfountain.py b/configs/llff/colorfountain.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e936d2ebec9a4f9372ccf22e47abffeec2c44a9
--- /dev/null
+++ b/configs/llff/colorfountain.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'colorfountain'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/colorfountain',
+)
diff --git a/configs/llff/fern.py b/configs/llff/fern.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9b8ed01b5aa6676420b9867bb4cf361d296fdc7
--- /dev/null
+++ b/configs/llff/fern.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'fern'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fern',
+)
diff --git a/configs/llff/flower.py b/configs/llff/flower.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0578b735b125abc5630b66cc29aaa59118a9c0
--- /dev/null
+++ b/configs/llff/flower.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'flower'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/flower',
+)
diff --git a/configs/llff/fortress.py b/configs/llff/fortress.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd675d52697b25a46832cd0c22cf65b281b767a1
--- /dev/null
+++ b/configs/llff/fortress.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'fortress'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/fortress',
+)
+
diff --git a/configs/llff/horns.py b/configs/llff/horns.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12a4fe46bffacceb6f28012eec4b795447d3aa4
--- /dev/null
+++ b/configs/llff/horns.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'horns'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/horns',
+)
diff --git a/configs/llff/kitchen.py b/configs/llff/kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9a715b269591213c7d61accf10a5ab2a830e581
--- /dev/null
+++ b/configs/llff/kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './llff_default.py'
+
+expname = '360_dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/llff/lab_desk.py b/configs/llff/lab_desk.py
new file mode 100644
index 0000000000000000000000000000000000000000..7efc25edd5f68dc4347193d81d4f92ce78f110e6
--- /dev/null
+++ b/configs/llff/lab_desk.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'lab_desk'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/lab_desk',
+)
diff --git a/configs/llff/leaves.py b/configs/llff/leaves.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d7cb3ee23158068fdd9fe1f2cb4267ab772d67
--- /dev/null
+++ b/configs/llff/leaves.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'leaves'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/leaves',
+)
+
diff --git a/configs/llff/llff_default.py b/configs/llff/llff_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0fd785a56891a9db1a2a20ada71c147b88c7350
--- /dev/null
+++ b/configs/llff/llff_default.py
@@ -0,0 +1,60 @@
+_base_ = '../default.py'
+
+basedir = './logs/llff'
+
+data = dict(
+ dataset_type='llff',
+ ndc=True,
+# width=1008,
+# height=756,
+ factor=4,
+)
+
+coarse_train = dict(
+ N_iters=0,
+)
+
+coarse_model_and_render = dict(
+ num_voxels=320**3,
+ f_num_voxels=320**3,
+ num_voxels_base=320**3,
+ f_num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+ f_k0_type='TensoRFGrid',
+ f_k0_config=dict(n_comp=64),
+)
+
+fine_train = dict(
+ N_iters=30000,
+ #N_iters=60000,
+ N_rand=4096 * 1,
+ #weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000],
+ ray_sampler='flatten',
+ tv_before=1e9,
+ tv_dense_before=10000,
+ weight_tv_density=1e-5,
+ weight_tv_k0=1e-6,
+)
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ f_num_voxels=320**3,
+ num_voxels_base=320**3,
+ f_num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+ f_k0_type='TensoRFGrid',
+ f_k0_config=dict(n_comp=64),
+
+ mpi_depth=128,
+ rgbnet_dim=9,
+ rgbnet_width=64,
+ world_bound_scale=1,
+ fast_color_thres=1e-3,
+)
diff --git a/configs/llff/orchids.py b/configs/llff/orchids.py
new file mode 100644
index 0000000000000000000000000000000000000000..385f52bfa2783ddc2e815f53be4236fe61874c0f
--- /dev/null
+++ b/configs/llff/orchids.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'orchids'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/orchids',
+)
+
diff --git a/configs/llff/plants.py b/configs/llff/plants.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8646158bcbf5316cfd58b2391f249c389758f7
--- /dev/null
+++ b/configs/llff/plants.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'plants'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/plants',
+)
+
diff --git a/configs/llff/playground.py b/configs/llff/playground.py
new file mode 100644
index 0000000000000000000000000000000000000000..e909469a4330b61d10921c43eddc39defebf47e6
--- /dev/null
+++ b/configs/llff/playground.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'playground'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/playground',
+)
diff --git a/configs/llff/pond.py b/configs/llff/pond.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5f9099edb0b2646de1a86121bf886d6c4eefbb
--- /dev/null
+++ b/configs/llff/pond.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'pond'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/pond',
+)
diff --git a/configs/llff/room.py b/configs/llff/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..8310ac73bf90f63808c755914a2cab702e6e40b6
--- /dev/null
+++ b/configs/llff/room.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'room'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/room',
+)
diff --git a/configs/llff/santarex.py b/configs/llff/santarex.py
new file mode 100644
index 0000000000000000000000000000000000000000..db548034a5c62b7b778c31bcc2000e644f772d2b
--- /dev/null
+++ b/configs/llff/santarex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'santarex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data_santarex',
+)
diff --git a/configs/llff/seg/llff_seg_default.py b/configs/llff/seg/llff_seg_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..65a089fee9a984f52d53cda46e63ccc0077f6d98
--- /dev/null
+++ b/configs/llff/seg/llff_seg_default.py
@@ -0,0 +1,52 @@
+_base_ = '../../seg_default.py'
+
+basedir = './logs/llff'
+
+data = dict(
+ dataset_type='llff',
+ ndc=True,
+# width=1008,
+# height=756,
+ factor=4,
+)
+
+coarse_train = dict(
+ N_iters=0,
+)
+
+coarse_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+)
+
+fine_train = dict(
+ N_iters=30000,
+ #N_iters=60000,
+ N_rand=4096 * 1,
+ #weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000],
+ ray_sampler='flatten',
+ tv_before=1e9,
+ tv_dense_before=10000,
+ weight_tv_density=1e-5,
+ weight_tv_k0=1e-6,
+)
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+
+ mpi_depth=128,
+ rgbnet_dim=9,
+ rgbnet_width=64,
+ world_bound_scale=1,
+ fast_color_thres=1e-1,
+)
diff --git a/configs/llff/seg/seg_butcher.py b/configs/llff/seg/seg_butcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..d98eed69df067cf4e0b79b1eb27cda41cd1cb20b
--- /dev/null
+++ b/configs/llff/seg/seg_butcher.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'butcher'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_butcher',
+)
diff --git a/configs/llff/seg/seg_chesstable.py b/configs/llff/seg/seg_chesstable.py
new file mode 100644
index 0000000000000000000000000000000000000000..c845fc88d73dcca44b815af1dd545ed1a794a562
--- /dev/null
+++ b/configs/llff/seg/seg_chesstable.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'chesstable'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data2_chesstable',
+)
diff --git a/configs/llff/seg/seg_fern.py b/configs/llff/seg/seg_fern.py
new file mode 100644
index 0000000000000000000000000000000000000000..658bb21756b31070ccbe586397756b68cbfc98b5
--- /dev/null
+++ b/configs/llff/seg/seg_fern.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'fern'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fern',
+)
diff --git a/configs/llff/seg/seg_flower.py b/configs/llff/seg/seg_flower.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5b6c368419da4ff99481407406c2523ed0e99c
--- /dev/null
+++ b/configs/llff/seg/seg_flower.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'flower'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/flower',
+)
diff --git a/configs/llff/seg/seg_fortress.py b/configs/llff/seg/seg_fortress.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d996616ff3e309b4a2521730aad7d6253ca5f5
--- /dev/null
+++ b/configs/llff/seg/seg_fortress.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'fortress'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fortress',
+)
+
diff --git a/configs/llff/seg/seg_horns.py b/configs/llff/seg/seg_horns.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7ff38bc7b8700dac4fec4ed9dfd66dbb1870d4
--- /dev/null
+++ b/configs/llff/seg/seg_horns.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'horns'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/horns',
+)
diff --git a/configs/llff/seg/seg_kitchen.py b/configs/llff/seg/seg_kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..cac02e2539d933ce0c6e787b77f4650dde082dad
--- /dev/null
+++ b/configs/llff/seg/seg_kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './llff_seg_default.py'
+
+expname = '360_dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/llff/seg/seg_leaves.py b/configs/llff/seg/seg_leaves.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb4362626bb5f0d1d9db80a185dcb7eef0f2899
--- /dev/null
+++ b/configs/llff/seg/seg_leaves.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'leaves'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/leaves',
+)
diff --git a/configs/llff/seg/seg_orchids.py b/configs/llff/seg/seg_orchids.py
new file mode 100644
index 0000000000000000000000000000000000000000..7288f1151f20f5c292c331fa340815a05276e622
--- /dev/null
+++ b/configs/llff/seg/seg_orchids.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'orchids'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/orchids',
+)
+
diff --git a/configs/llff/seg/seg_pond.py b/configs/llff/seg/seg_pond.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c6e69f858bc8c47da4c189f84b0546da2cb7b3d
--- /dev/null
+++ b/configs/llff/seg/seg_pond.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'pond'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/pond',
+)
diff --git a/configs/llff/seg/seg_room.py b/configs/llff/seg/seg_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ef1c4b99b03f0a8b4bd52fe99895cf571e338e0
--- /dev/null
+++ b/configs/llff/seg/seg_room.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'room'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/room',
+)
diff --git a/configs/llff/seg/seg_santarex.py b/configs/llff/seg/seg_santarex.py
new file mode 100644
index 0000000000000000000000000000000000000000..422abff46e9494208c8e64deeab8ceaf07a229ad
--- /dev/null
+++ b/configs/llff/seg/seg_santarex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'santarex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data_santarex',
+)
diff --git a/configs/llff/seg/seg_shoerack.py b/configs/llff/seg/seg_shoerack.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd93c1ff08f3290f6a5501b6ae9288353672338
--- /dev/null
+++ b/configs/llff/seg/seg_shoerack.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'shoerack'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_shoerack',
+)
+
diff --git a/configs/llff/seg/seg_statue.py b/configs/llff/seg/seg_statue.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e0286c7ea48d10a6a045362e21d7543f9a2b2d7
--- /dev/null
+++ b/configs/llff/seg/seg_statue.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'statue'
+
+data = dict(
+ datadir='./data/statue',
+ factor=1,
+)
diff --git a/configs/llff/seg/seg_stove.py b/configs/llff/seg/seg_stove.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e06241a0b76e371ca34187abdc25ed97f43b51
--- /dev/null
+++ b/configs/llff/seg/seg_stove.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'stove'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_stove',
+)
diff --git a/configs/llff/seg/seg_trex.py b/configs/llff/seg/seg_trex.py
new file mode 100644
index 0000000000000000000000000000000000000000..528721d4051fadcd769b725588675713ef799ab1
--- /dev/null
+++ b/configs/llff/seg/seg_trex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'trex'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/trex',
+)
diff --git a/configs/llff/shelves.py b/configs/llff/shelves.py
new file mode 100644
index 0000000000000000000000000000000000000000..002f35d6ef6df9cecb9149946640034ed0a20ae6
--- /dev/null
+++ b/configs/llff/shelves.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'shelves'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/shelves',
+)
diff --git a/configs/llff/shoerack.py b/configs/llff/shoerack.py
new file mode 100644
index 0000000000000000000000000000000000000000..08fae68c3abb3c176c46627b2011e0785c78fa88
--- /dev/null
+++ b/configs/llff/shoerack.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'shoerack'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_shoerack',
+)
+
diff --git a/configs/llff/statue.py b/configs/llff/statue.py
new file mode 100644
index 0000000000000000000000000000000000000000..825b8024563843b5998071b79ef22842c1b93a5a
--- /dev/null
+++ b/configs/llff/statue.py
@@ -0,0 +1,14 @@
+_base_ = './llff_default.py'
+
+expname = 'statue'
+
+data = dict(
+ datadir='data/statue',
+ factor=1,
+ # ndc=True,
+ # spherify=False,
+ # white_bkgd=True,
+ # rand_bkgd=False,
+ # unbounded_inward=False,
+ # load2gpu_on_the_fly=False,
+)
diff --git a/configs/llff/stove.py b/configs/llff/stove.py
new file mode 100644
index 0000000000000000000000000000000000000000..026aa7a4abff8a325d8e9bfdf5207f186990ed14
--- /dev/null
+++ b/configs/llff/stove.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'stove'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_stove',
+)
diff --git a/configs/llff/succtrough.py b/configs/llff/succtrough.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd699b49221d7750186d038a4cb37d5fd51e8d3
--- /dev/null
+++ b/configs/llff/succtrough.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'succtrough'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/succtrough',
+)
diff --git a/configs/llff/trex.py b/configs/llff/trex.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b299e74fa14d3bcec0bdf68e4c47d0647e7e774
--- /dev/null
+++ b/configs/llff/trex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'trex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/trex',
+)
diff --git a/configs/nerf_unbounded/bicycle.py b/configs/nerf_unbounded/bicycle.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95e646989e0f48b448327dc0efdc2b1a9b9c0e5
--- /dev/null
+++ b/configs/nerf_unbounded/bicycle.py
@@ -0,0 +1,14 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_bicycle_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bicycle',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
+
diff --git a/configs/nerf_unbounded/bonsai.py b/configs/nerf_unbounded/bonsai.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f8d5b0be377547adbf3fd2dfbcb314984864d3
--- /dev/null
+++ b/configs/nerf_unbounded/bonsai.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_bonsai_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bonsai',
+ factor=4, # 1559x1039
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0, # negative down
+ shift_z=0,
+ scale_r=1.0,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/counter.py b/configs/nerf_unbounded/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c7d362485b684b6e4da4bafd40868f5cc89f55e
--- /dev/null
+++ b/configs/nerf_unbounded/counter.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_counter_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/counter',
+ factor=4, # 1558x1038
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.2, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/fish.py b/configs/nerf_unbounded/fish.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fb9ba7968bc1e60f34d0d10ea02ba2a2748798d
--- /dev/null
+++ b/configs/nerf_unbounded/fish.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_fish_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/fish',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/fork.py b/configs/nerf_unbounded/fork.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80c67079744bcf80e0828b005cc3bffb56f5308
--- /dev/null
+++ b/configs/nerf_unbounded/fork.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_fork_unbounded'
+
+data = dict(
+ datadir='./data/fork/dense',
+ factor=8, # 1558x1038
+ bd_factor=None,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/garden.py b/configs/nerf_unbounded/garden.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecbc1dbbb10c0c8ba50c7f68d24bf66e0e64a4ac
--- /dev/null
+++ b/configs/nerf_unbounded/garden.py
@@ -0,0 +1,15 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_garden_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/garden',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30,
+ ),
+)
diff --git a/configs/nerf_unbounded/kitchen.py b/configs/nerf_unbounded/kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..aec8a4bcb666a39c0afbea633958023a8648a167
--- /dev/null
+++ b/configs/nerf_unbounded/kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./nerf/data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/lab_desk.py b/configs/nerf_unbounded/lab_desk.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fba48d4e7e1a18c9f645959886a4dc34ccf55ab
--- /dev/null
+++ b/configs/nerf_unbounded/lab_desk.py
@@ -0,0 +1,8 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'lab_desk'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/lab_desk',
+ factor=2,
+)
diff --git a/configs/nerf_unbounded/legohouse.py b/configs/nerf_unbounded/legohouse.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c49c26a20676c26703c9af54d15b1357819305e
--- /dev/null
+++ b/configs/nerf_unbounded/legohouse.py
@@ -0,0 +1,13 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_legohouse_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/legohouse',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/mat.py b/configs/nerf_unbounded/mat.py
new file mode 100644
index 0000000000000000000000000000000000000000..f912bcd496657673b3d0d692dfba69bff3058bb6
--- /dev/null
+++ b/configs/nerf_unbounded/mat.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_mat_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/mat',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/nerf_unbounded_default.py b/configs/nerf_unbounded/nerf_unbounded_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb18b7e885aff50f7f82b9a6817f775529e26d42
--- /dev/null
+++ b/configs/nerf_unbounded/nerf_unbounded_default.py
@@ -0,0 +1,51 @@
+_base_ = '../default.py'
+
+basedir = './logs/nerf_unbounded'
+
+data = dict(
+ dataset_type='llff',
+ spherify=True,
+ factor=4,
+ llffhold=8,
+ white_bkgd=True,
+ rand_bkgd=True,
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7,
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/nerf_unbounded/pinecone.py b/configs/nerf_unbounded/pinecone.py
new file mode 100644
index 0000000000000000000000000000000000000000..4daa91a07a6aeb375efd106e0f108395a1ff6831
--- /dev/null
+++ b/configs/nerf_unbounded/pinecone.py
@@ -0,0 +1,15 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_pinecone_unbounded'
+
+data = dict(
+ datadir='./data/nerf_real_360/pinecone',
+ factor=8, # 484x363
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
\ No newline at end of file
diff --git a/configs/nerf_unbounded/redtable.py b/configs/nerf_unbounded/redtable.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd3e20e100f39b6ba50c7a6b0d9c6a015f2f6b0
--- /dev/null
+++ b/configs/nerf_unbounded/redtable.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_redtable_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/redtable',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/room.py b/configs/nerf_unbounded/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/nerf_unbounded/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/sculptures.py b/configs/nerf_unbounded/sculptures.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f9398989e1039ee38f9253daf99b85803603e7
--- /dev/null
+++ b/configs/nerf_unbounded/sculptures.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_sculptures_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/sculptures',
+ factor=4, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_bicycle.py b/configs/nerf_unbounded/seg_bicycle.py
new file mode 100644
index 0000000000000000000000000000000000000000..54ffa8ca8680b0a5a4b1169c6a1fc33575b00036
--- /dev/null
+++ b/configs/nerf_unbounded/seg_bicycle.py
@@ -0,0 +1,14 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_bicycle_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bicycle',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_bonsai.py b/configs/nerf_unbounded/seg_bonsai.py
new file mode 100644
index 0000000000000000000000000000000000000000..d186467b2e8012653c314225dd7f1bee15d19778
--- /dev/null
+++ b/configs/nerf_unbounded/seg_bonsai.py
@@ -0,0 +1,16 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_bonsai_unbounded'
+
+data = dict(
+ datadir='./data/nerf/360_v2/bonsai',
+ factor=4, # 1559x1039
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0, # negative down
+ shift_z=0,
+ scale_r=1.0,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_counter.py b/configs/nerf_unbounded/seg_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..c18f4c63884ef6f87ef24f1dd0008ad43f096391
--- /dev/null
+++ b/configs/nerf_unbounded/seg_counter.py
@@ -0,0 +1,16 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_counter_unbounded'
+
+data = dict(
+ datadir='./data/nerf/360_v2/counter',
+ factor=8, # 1558x1038
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.2, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_fork.py b/configs/nerf_unbounded/seg_fork.py
new file mode 100644
index 0000000000000000000000000000000000000000..5318166be98d21ed1b0b6f2c791505c185345e4e
--- /dev/null
+++ b/configs/nerf_unbounded/seg_fork.py
@@ -0,0 +1,17 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dcvgo_fork_unbounded'
+
+data = dict(
+ datadir='./data/nerf/fork/',
+ factor=8, # 1558x1038
+ bd_factor=None,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_garden.py b/configs/nerf_unbounded/seg_garden.py
new file mode 100644
index 0000000000000000000000000000000000000000..11909ce245e4140afe24f174e7fdc3bf5d591940
--- /dev/null
+++ b/configs/nerf_unbounded/seg_garden.py
@@ -0,0 +1,15 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_garden_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/garden',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_kitchen.py b/configs/nerf_unbounded/seg_kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..90caab07762467ab5488121539b9afd78ae5d7ef
--- /dev/null
+++ b/configs/nerf_unbounded/seg_kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_nerf_unbounded_default.py b/configs/nerf_unbounded/seg_nerf_unbounded_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..90ac5106edf1433df199ae1a3baf669a21fe67fb
--- /dev/null
+++ b/configs/nerf_unbounded/seg_nerf_unbounded_default.py
@@ -0,0 +1,52 @@
+_base_ = '../seg_default.py'
+
+basedir = './logs/nerf_unbounded'
+
+data = dict(
+ dataset_type='llff',
+ spherify=True,
+ factor=4,
+ llffhold=8,
+ white_bkgd=True,
+ rand_bkgd=True,
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7,
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres=0.1,
+# fast_color_thres={
+# '_delete_': True,
+# 0 : alpha_init*stepsize/10,
+# 1500: min(alpha_init, 1e-4)*stepsize/5,
+# 2500: min(alpha_init, 1e-4)*stepsize/2,
+# 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+# 4500: min(alpha_init, 1e-4)*stepsize,
+# 5500: min(alpha_init, 1e-4),
+# 6500: 1e-4,
+# },
+ world_bound_scale=1,
+)
diff --git a/configs/nerf_unbounded/seg_pinecone.py b/configs/nerf_unbounded/seg_pinecone.py
new file mode 100644
index 0000000000000000000000000000000000000000..e36a3617272f6fc48ce78cee2929c0ae6bf00696
--- /dev/null
+++ b/configs/nerf_unbounded/seg_pinecone.py
@@ -0,0 +1,15 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dcvgo_pinecone_unbounded'
+
+data = dict(
+ datadir='./data/nerf/nerf_real_360/pinecone',
+ factor=8, # 484x363
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/seg_default.py b/configs/seg_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..a85034dc67cb6e6121df7bd04a1b96994968b5d3
--- /dev/null
+++ b/configs/seg_default.py
@@ -0,0 +1,122 @@
+from copy import deepcopy
+
+expname = None # experiment name
+basedir = './logs/' # where to store ckpts and logs
+
+''' Template of data options
+'''
+data = dict(
+ datadir=None, # path to dataset root folder
+ dataset_type=None, # blender | nsvf | blendedmvs | tankstemple | deepvoxels | co3d
+ inverse_y=False, # intrinsic mode (to support blendedmvs, nsvf, tankstemple)
+ flip_x=False, # to support co3d
+ flip_y=False, # to support co3d
+ annot_path='', # to support co3d
+ split_path='', # to support co3d
+ sequence_name='', # to support co3d
+# load2gpu_on_the_fly=False, # do not load all images into gpu (to save gpu memory)
+ load2gpu_on_the_fly=True, # do not load all images into gpu (to save gpu memory)
+ testskip=5, # subsample testset to preview results
+ white_bkgd=True, # use white background (note that some datasets don't provide alpha and come with a blended bg color)
+ rand_bkgd=False, # use random background during training
+ half_res=False, # [TODO]
+ bd_factor=.75,
+ movie_render_kwargs=dict(),
+
+ # Below are forward-facing llff specific settings.
+ ndc=False, # use ndc coordinate (only for forward-facing; not support yet)
+ spherify=False, # inward-facing
+ factor=4, # [TODO]
+ width=None, # enforce image width
+ height=None, # enforce image height
+ llffhold=8, # testsplit
+ load_depths=False, # load depth
+
+ # Below are unbounded inward-facing specific settings.
+ unbounded_inward=False,
+ unbounded_inner_r=1.0,
+)
+
+''' Template of training options
+'''
+coarse_train = dict(
+ N_iters=5000, # number of optimization steps
+ N_rand=8192, # batch size (number of random rays per optimization step)
+ #N_rand=1024, # batch size (number of random rays per optimization step)
+ lrate_seg_mask_grid=1, # lr of segmentation voxel grid
+ lrate_dual_seg_mask_grid=1, # lr of dual segmentation voxel grid
+ # lrate_k0_mask_grid=1e-2,
+ lrate_density=0, # lr of density voxel grid
+ lrate_k0=0, # lr of color/feature voxel grid
+ lrate_rgbnet=0, # lr of the mlp to predict view-dependent color
+ lrate_decay=20, # lr decay by 0.1 after every lrate_decay*1000 steps
+ pervoxel_lr=False, # view-count-based lr
+ pervoxel_lr_downrate=0, # downsampled image for computing view-count-based lr
+ ray_sampler='random', # ray sampling strategies
+ weight_main=1.0, # weight of photometric loss
+ weight_entropy_last=0.01, # weight of background entropy loss
+ weight_nearclip=0,
+ weight_distortion=0,
+ weight_rgbper=0.1, # weight of per-point rgb loss
+ tv_every=1, # count total variation loss every tv_every step
+ tv_after=0, # count total variation loss from tv_from step
+ tv_before=0, # count total variation before the given number of iterations
+ tv_dense_before=0, # count total variation densely before the given number of iterations
+ weight_tv_density=0.0, # weight of total variation loss of density voxel grid
+ weight_tv_k0=0.0, # weight of total variation loss of color/feature voxel grid
+ pg_scale=[], # checkpoints for progressive scaling
+ decay_after_scale=1.0, # decay act_shift after scaling
+ skip_zero_grad_fields=[], # the variable name to skip optimizing parameters w/ zero grad in each iteration
+ maskout_lt_nviews=0,
+)
+
+fine_train = deepcopy(coarse_train)
+fine_train.update(dict(
+ N_iters=20000,
+ pervoxel_lr=False,
+ ray_sampler='flatten',
+ weight_entropy_last=0.001,
+ weight_rgbper=0.01,
+ pg_scale=[1000, 2000, 3000, 4000],
+ skip_zero_grad_fields=['density', 'k0'],
+))
+
+''' Template of model and rendering options
+'''
+coarse_model_and_render = dict(
+ num_voxels=1024000, # expected number of voxel
+ num_voxels_base=1024000, # to rescale delta distance
+ density_type='DenseGrid', # DenseGrid, TensoRFGrid
+ k0_type='TensoRFGrid', # DenseGrid, TensoRFGrid
+ density_config=dict(),
+ k0_config=dict(n_comp=48),
+ mpi_depth=128, # the number of planes in Multiplane Image (work when ndc=True)
+ nearest=False, # nearest interpolation
+ pre_act_density=False, # pre-activated trilinear interpolation
+ in_act_density=False, # in-activated trilinear interpolation
+ bbox_thres=1e-3, # threshold to determine known free-space in the fine stage
+ mask_cache_thres=1e-3, # threshold to determine a tighten BBox in the fine stage
+ rgbnet_dim=0, # feature voxel grid dim
+ rgbnet_full_implicit=False, # let the colors MLP ignore feature voxel grid
+ rgbnet_direct=True, # set to False to treat the first 3 dim of feature voxel grid as diffuse rgb
+ rgbnet_depth=3, # depth of the colors MLP (there are rgbnet_depth-1 intermediate features)
+ rgbnet_width=128, # width of the colors MLP
+ alpha_init=1e-6, # set the alpha values everywhere at the begin of training
+ fast_color_thres=1e-7, # threshold of alpha value to skip the fine stage sampled point
+ maskout_near_cam_vox=True, # maskout grid points that between cameras and their near planes
+ world_bound_scale=1, # rescale the BBox enclosing the scene
+ stepsize=0.5, # sampling stepsize in volume rendering
+)
+
+fine_model_and_render = deepcopy(coarse_model_and_render)
+fine_model_and_render.update(dict(
+ num_voxels=160**3,
+ num_voxels_base=160**3,
+ rgbnet_dim=12,
+ alpha_init=1e-2,
+ fast_color_thres=1e-4,
+ maskout_near_cam_vox=False,
+ world_bound_scale=1.05,
+))
+
+del deepcopy
diff --git a/croco/LICENSE b/croco/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d
--- /dev/null
+++ b/croco/LICENSE
@@ -0,0 +1,52 @@
+CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
+
+
+SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+This software in this file incorporates parts of the following software available here:
+
+Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py
+available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE
+
+MoCo v3: https://github.com/facebookresearch/moco-v3
+available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE
+
+DeiT: https://github.com/facebookresearch/deit
+available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE
+
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:
+
+https://github.com/facebookresearch/mae/blob/main/LICENSE
+
+Attribution-NonCommercial 4.0 International
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/blocks.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/rwightman/pytorch-image-models
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:
+
+https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/croco/NOTICE b/croco/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c
--- /dev/null
+++ b/croco/NOTICE
@@ -0,0 +1,21 @@
+CroCo
+Copyright 2022-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+facebookresearch/mae
+https://github.com/facebookresearch/mae
+
+Attribution-NonCommercial 4.0 International
+
+====
+
+rwightman/pytorch-image-models
+https://github.com/rwightman/pytorch-image-models
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/croco/README.MD b/croco/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b
--- /dev/null
+++ b/croco/README.MD
@@ -0,0 +1,124 @@
+# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow
+
+[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)]
+
+This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), referred to as CroCo v2:
+
+![image](assets/arch.jpg)
+
+```bibtex
+@inproceedings{croco,
+ title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}},
+ author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}},
+ booktitle={{NeurIPS}},
+ year={2022}
+}
+
+@inproceedings{croco_v2,
+ title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}},
+ author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me},
+ booktitle={ICCV},
+ year={2023}
+}
+```
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information.
+Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License.
+Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license.
+
+## Preparation
+
+1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version.
+
+```bash
+conda create -n croco python=3.7 cmake=3.14.0
+conda activate croco
+conda install habitat-sim headless -c conda-forge -c aihabitat
+conda install pytorch torchvision -c pytorch
+conda install notebook ipykernel matplotlib
+conda install ipywidgets widgetsnbextension
+conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation
+
+```
+
+2. Compile cuda kernels for RoPE
+
+CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels.
+```bash
+cd models/curope/
+python setup.py build_ext --inplace
+cd ../../
+```
+
+This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only.
+You might also need to set the environment `CUDA_HOME` in case you use a custom cuda installation.
+
+In case you cannot compile the cuda kernels, we also provide a slow pytorch version, which will be loaded automatically.
+
+3. Download pre-trained model
+
+We provide several pre-trained models:
+
+| modelname | pre-training data | pos. embed. | Encoder | Decoder |
+|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------|
+| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small |
+| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small |
+| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base |
+| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base |
+
+To download a specific model, i.e., the first one (`CroCo.pth`)
+```bash
+mkdir -p pretrained_models/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/
+```
+
+## Reconstruction example
+
+Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`)
+```bash
+python demo.py
+```
+
+## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator
+
+First download the test scene from Habitat:
+```bash
+python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/
+```
+
+Then, run the Notebook demo `interactive_demo.ipynb`.
+
+In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo.
+![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg)
+
+## Pre-training
+
+### CroCo
+
+To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command:
+```
+torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/
+```
+
+Our CroCo pre-training was launched on a single server with 4 GPUs.
+It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experiment to verify that it is valid in our case.
+The first run can take a few minutes to start, to parse all available pre-training pairs.
+
+### CroCo v2
+
+For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD).
+Then, run the following command for the largest model (ViT-L encoder, Base decoder):
+```
+torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/
+```
+
+Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases.
+The largest model should take around 12 days on A100.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experiment to verify that it is valid in our case.
+
+## Stereo matching and Optical flow downstream tasks
+
+For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD).
diff --git a/croco/assets/Chateau1.png b/croco/assets/Chateau1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d282fc6a51c00b8dd8267d5d507220ae253c2d65
Binary files /dev/null and b/croco/assets/Chateau1.png differ
diff --git a/croco/assets/Chateau2.png b/croco/assets/Chateau2.png
new file mode 100644
index 0000000000000000000000000000000000000000..722b2fc553ec089346722efb9445526ddfa8e7bd
Binary files /dev/null and b/croco/assets/Chateau2.png differ
diff --git a/croco/assets/arch.jpg b/croco/assets/arch.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3f5b032729ddc58c06d890a0ebda1749276070c4
Binary files /dev/null and b/croco/assets/arch.jpg differ
diff --git a/croco/croco-stereo-flow-demo.ipynb b/croco/croco-stereo-flow-demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0
--- /dev/null
+++ b/croco/croco-stereo-flow-demo.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9bca0f41",
+ "metadata": {},
+ "source": [
+ "# Simple inference example with CroCo-Stereo or CroCo-Flow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80653ef7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f033862",
+ "metadata": {},
+ "source": [
+ "First download the model(s) of your choice by running\n",
+ "```\n",
+ "bash stereoflow/download_model.sh crocostereo.pth\n",
+ "bash stereoflow/download_model.sh crocoflow.pth\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fb2e392",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "import matplotlib.pylab as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0e25d77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from stereoflow.test import _load_model_and_criterion\n",
+ "from stereoflow.engine import tiled_pred\n",
+ "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n",
+ "from stereoflow.datasets_flow import flowToColor\n",
+ "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86a921f5",
+ "metadata": {},
+ "source": [
+ "### CroCo-Stereo example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64e483cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0d04303",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47dc14b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).squeeze(0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "583b9f16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(vis_disparity(pred))\n",
+ "plt.axis('off')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2df5d70",
+ "metadata": {},
+ "source": [
+ "### CroCo-Flow example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ee257a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5edccf0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b19692c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26f79db3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(flowToColor(pred))\n",
+ "plt.axis('off')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/croco/datasets/__init__.py b/croco/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/croco/datasets/crops/README.MD b/croco/datasets/crops/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f
--- /dev/null
+++ b/croco/datasets/crops/README.MD
@@ -0,0 +1,104 @@
+## Generation of crops from the real datasets
+
+The instructions below allow you to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL.
+
+### Download the metadata of the crops to generate
+
+First, download the metadata and put them in `./data/`:
+```
+mkdir -p data
+cd data/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip
+unzip crop_metadata.zip
+rm crop_metadata.zip
+cd ..
+```
+
+### Prepare the original datasets
+
+Second, download the original datasets in `./data/original_datasets/`.
+```
+mkdir -p data/original_datasets
+```
+
+##### ARKitScenes
+
+Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`.
+The resulting file structure should be like:
+```
+./data/original_datasets/ARKitScenes/
+└───Training
+ └───40753679
+ │ │ ultrawide
+ │ │ ...
+ └───40753686
+ │
+ ...
+```
+
+##### MegaDepth
+
+Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/MegaDepth/
+└───0000
+│ └───images
+│ │ │ 1000557903_87fa96b8a4_o.jpg
+│ │ └ ...
+│ └─── ...
+└───0001
+│ │
+│ └ ...
+└─── ...
+```
+
+##### 3DStreetView
+
+Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/3DStreetView/
+└───dataset_aligned
+│ └───0002
+│ │ │ 0000002_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+└───dataset_unaligned
+│ └───0003
+│ │ │ 0000003_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+```
+
+##### IndoorVL
+
+Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture).
+
+```
+pip install kapture
+mkdir -p ./data/original_datasets/IndoorVL
+cd ./data/original_datasets/IndoorVL
+kapture_download_dataset.py update
+kapture_download_dataset.py install "HyundaiDepartmentStore_*"
+kapture_download_dataset.py install "GangnamStation_*"
+cd -
+```
+
+### Extract the crops
+
+Now, extract the crops for each of the datasets:
+```
+for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL;
+do
+ python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500;
+done
+```
+
+##### Note for IndoorVL
+
+Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper.
+To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively.
+The impact on the performance is negligible.
diff --git a/croco/datasets/crops/extract_crops_from_images.py b/croco/datasets/crops/extract_crops_from_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb66a0474ce44b54c44c08887cbafdb045b11ff3
--- /dev/null
+++ b/croco/datasets/crops/extract_crops_from_images.py
@@ -0,0 +1,159 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Extracting crops for pre-training
+# --------------------------------------------------------
+
+import os
+import argparse
+from tqdm import tqdm
+from PIL import Image
+import functools
+from multiprocessing import Pool
+import math
+
+
def arg_parser():
    """Build the command-line parser for the crop-extraction script."""
    parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list')

    # Mandatory path arguments.
    for flag, helptext in (('--crops', 'crop file'),
                           ('--root-dir', 'root directory'),
                           ('--output-dir', 'output directory')):
        parser.add_argument(flag, type=str, required=True, help=helptext)
    parser.add_argument('--imsize', type=int, default=256, help='size of the crops')
    parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads')
    parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories')
    parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir')
    return parser
+
+
def main(args):
    """Generate all crops listed in ``args.crops`` and write a listing file.

    Loads the crop metadata, sizes the output directory tree so that roughly
    ``args.ideal_number_pairs_in_dir`` pairs land in each directory (capped at
    ``args.max_subdir_levels`` levels), then extracts and saves the crops —
    optionally with a process pool — while recording every generated pair path
    in ``<output_dir>/listing.txt``.
    """
    listing_path = os.path.join(args.output_dir, 'listing.txt')

    print(f'Loading list of crops ... ({args.nthread} threads)')
    crops, num_crops_to_generate = load_crop_file(args.crops)

    print(f'Preparing jobs ({len(crops)} candidate image pairs)...')
    # Number of directory levels needed so that each directory holds about
    # `ideal_number_pairs_in_dir` entries.
    num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels)
    num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels))

    jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
    del crops  # free the (potentially large) metadata before forking workers

    os.makedirs(args.output_dir, exist_ok=True)
    call = functools.partial(save_image_crops, args)
    # Only spin up a process pool when more than one worker was requested.
    # The pool is explicitly closed and joined below — the original created it
    # inline and leaked it.
    pool = Pool(args.nthread) if args.nthread > 1 else None
    mmap = pool.imap_unordered if pool is not None else map

    print(f"Generating cropped images to {args.output_dir} ...")
    try:
        with open(listing_path, 'w') as listing:
            listing.write('# pair_path\n')
            for results in tqdm(mmap(call, jobs), total=len(jobs)):
                for path in results:
                    listing.write(f'{path}\n')
    finally:
        if pool is not None:
            pool.close()
            pool.join()
    print('Finished writing listing to', listing_path)
+
+
def load_crop_file(path):
    """Parse a crop-metadata file.

    The file interleaves two kinds of ", "-separated lines:
      * pair lines: ``<img1>, <img2>, <rotation>`` (3 fields);
      * crop lines: 8 integers ``l1, r1, t1, b1, l2, r2, t2, b2`` giving one
        crop rectangle per image of the preceding pair line.
    Lines starting with '#' are comments.

    Returns:
        (pairs, num_crops_to_generate): ``pairs`` is a list of
        (img1, img2, rotation, [(rect1, rect2), ...]) tuples, with rectangles
        in PIL (left, top, right, bottom) order.
    """
    # Use a context manager so the file handle is released deterministically
    # (the original used a bare `open(path).read()` and leaked it).
    with open(path) as f:
        data = f.read().splitlines()
    pairs = []
    num_crops_to_generate = 0
    for line in tqdm(data):
        if line.startswith('#'):
            continue
        line = line.split(', ')
        if len(line) < 8:
            # Pair line: start a new entry with an empty crop list.
            img1, img2, rotation = line
            pairs.append((img1, img2, int(rotation), []))
        else:
            # Crop line: attach the rectangles to the most recent pair.
            l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line)
            rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
            pairs[-1][-1].append((rect1, rect2))
            num_crops_to_generate += 1
    return pairs, num_crops_to_generate
+
+
def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
    """Turn the parsed pair list into per-pair jobs with output sub-paths.

    Each job is ((img1, img2), rotation, crops, paths), where ``paths`` holds
    one relative output path (hex components, no extension) per crop.
    """
    jobs = []
    # powers[level] = number of leaf entries spanned by one directory at that level.
    powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]

    def get_path(idx):
        # Map a flat crop index to a nested hex path such as "3/1f/2a4".
        idx_array = []
        d = idx
        for level in range(num_levels - 1):
            idx_array.append(idx // powers[level])
            idx = idx % powers[level]
        # NOTE(review): the leaf component is the *full* original index `d`,
        # not the remainder left in `idx` after the loop — this keeps leaf
        # names globally unique across directories; confirm it is intentional.
        idx_array.append(d)
        return '/'.join(map(lambda x: hex(x)[2:], idx_array))

    idx = 0
    for pair_data in tqdm(pairs):
        img1, img2, rotation, crops = pair_data
        if -60 <= rotation and rotation <= 60:
            rotation = 0 # most likely not a true rotation
        paths = [get_path(idx + k) for k in range(len(crops))]
        idx += len(crops)
        jobs.append(((img1, img2), rotation, crops, paths))
    return jobs
+
+
def load_image(path):
    """Open an image file and convert it to RGB.

    Raises:
        OSError: if the image cannot be read or decoded. The original
            exception is chained as the cause and the offending path is
            included in the message (the original raised a bare ``OSError()``
            that dropped both).
    """
    try:
        return Image.open(path).convert('RGB')
    except Exception as e:
        print('skipping', path, e)
        # Normalize any decoding failure to OSError so callers can catch a
        # single exception type; keep the cause for debuggability.
        raise OSError(f'could not load image: {path}') from e
+
+
def save_image_crops(args, data):
    """Extract and save all crops of one image pair.

    ``data`` is one job from ``prepare_jobs``: ((img1, img2), rotation,
    crop rectangles, output paths). Returns the list of pair paths actually
    written (empty if either source image failed to load).
    """
    # load images
    img_pair, rot, crops, paths = data
    try:
        img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair]
    except OSError as e:
        # Skip the whole pair if either image is unreadable.
        return []

    def area(sz):
        # Pixel area of a PIL (width, height) size tuple.
        return sz[0] * sz[1]

    tgt_size = (args.imsize, args.imsize)

    def prepare_crop(img, rect, rot=0):
        # actual crop
        img = img.crop(rect)

        # resize to desired size; LANCZOS for strong downscaling (source area
        # more than 4x the target), BICUBIC otherwise
        interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC
        img = img.resize(tgt_size, resample=interp)

        # rotate the image by the nearest multiple of 90 degrees
        rot90 = (round(rot/90) % 4) * 90
        if rot90 == 90:
            img = img.transpose(Image.Transpose.ROTATE_90)
        elif rot90 == 180:
            img = img.transpose(Image.Transpose.ROTATE_180)
        elif rot90 == 270:
            img = img.transpose(Image.Transpose.ROTATE_270)
        return img

    results = []
    for (rect1, rect2), path in zip(crops, paths):
        crop1 = prepare_crop(img1, rect1)
        crop2 = prepare_crop(img2, rect2, rot)  # only the second view is rotated

        fullpath1 = os.path.join(args.output_dir, path+'_1.jpg')
        fullpath2 = os.path.join(args.output_dir, path+'_2.jpg')
        os.makedirs(os.path.dirname(fullpath1), exist_ok=True)

        # NOTE(review): overwrite guards — stripped when run with `python -O`
        assert not os.path.isfile(fullpath1), fullpath1
        assert not os.path.isfile(fullpath2), fullpath2
        crop1.save(fullpath1)
        crop2.save(fullpath2)
        results.append(path)

    return results
+
+
if __name__ == '__main__':
    # Parse CLI arguments and run the extraction pipeline.
    main(arg_parser().parse_args())
+
diff --git a/croco/datasets/habitat_sim/README.MD b/croco/datasets/habitat_sim/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940
--- /dev/null
+++ b/croco/datasets/habitat_sim/README.MD
@@ -0,0 +1,76 @@
+## Generation of synthetic image pairs using Habitat-Sim
+
+These instructions allow you to generate pre-training pairs from the Habitat simulator.
+As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent.
+
+### Download Habitat-Sim scenes
+Download Habitat-Sim scenes:
+- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md
+- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets.
+- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`.
+```
+./data/
+└──habitat-sim-data/
+ └──scene_datasets/
+ ├──hm3d/
+ ├──gibson/
+ ├──habitat-test-scenes/
+ ├──replica_cad_baked_lighting/
+ ├──replica_cad/
+ ├──ReplicaDataset/
+ └──scannet/
+```
+
+### Image pairs generation
+We provide metadata to generate reproducible image pairs for pretraining and validation.
+Experiments described in the paper used similar data, but whose generation was not reproducible at the time.
+
+Specifications:
+- 256x256 resolution images, with a 60-degree field of view.
+- Up to 1000 image pairs per scene.
+- Number of scenes considered/number of images pairs per dataset:
+ - Scannet: 1097 scenes / 985 209 pairs
+ - HM3D:
+ - hm3d/train: 800 / 800k pairs
+ - hm3d/val: 100 scenes / 100k pairs
+ - hm3d/minival: 10 scenes / 10k pairs
+ - habitat-test-scenes: 3 scenes / 3k pairs
+ - replica_cad_baked_lighting: 13 scenes / 13k pairs
+
+- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes.
+
+Download metadata and extract it:
+```bash
+mkdir -p data/habitat_release_metadata/
+cd data/habitat_release_metadata/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz
+tar -xvf multiview_habitat_metadata.tar.gz
+cd ../..
+# Location of the metadata
+METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata"
+```
+
+Generate image pairs from metadata:
+- The following command will print a list of commandlines to generate image pairs for each scene:
+```bash
+# Target output directory
+PAIRS_DATASET_DIR="./data/habitat_release/"
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR
+```
+- One can launch multiple of such commands in parallel e.g. using GNU Parallel:
+```bash
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16
+```
+
+## Metadata generation
+
+Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible:
+```bash
+# Print commandlines to generate image pairs from the different scenes available.
+PAIRS_DATASET_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR
+
+# Once a dataset is generated, pack metadata files for reproducibility.
+METADATA_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR
+```
diff --git a/croco/datasets/habitat_sim/__init__.py b/croco/datasets/habitat_sim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/croco/datasets/habitat_sim/generate_from_metadata.py b/croco/datasets/habitat_sim/generate_from_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0d399084359495250dc8184671ff498adfbf2
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_from_metadata.py
@@ -0,0 +1,92 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script to generate image pairs for a given scene reproducing poses provided in a metadata file.
+"""
+import os
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator
+from datasets.habitat_sim.paths import SCENES_DATASET
+import argparse
+import quaternion
+import PIL.Image
+import cv2
+import json
+from tqdm import tqdm
+
def generate_multiview_images_from_metadata(metadata_filename,
                                            output_dir,
                                            overload_params = dict(),
                                            scene_datasets_paths=None,
                                            exist_ok=False):
    """
    Generate images from a metadata file for reproducibility purposes.

    Args:
        metadata_filename: JSON metadata produced by a previous generation run
            (simulator parameters plus the list of view positions/orientations).
        output_dir: directory where images, depth maps and metadata are written.
        overload_params: metadata entries to override before generation.
            NOTE(review): mutable default argument — harmless here because it
            is never mutated, but a ``None`` default would be safer.
        scene_datasets_paths: optional {dataset_label: local_path} mapping used
            to relocate scene/navmesh paths recorded in the metadata.
        exist_ok: forwarded to os.makedirs for the output directory.
    """
    # Reorder labels by decreasing length so that the longest matching dataset
    # label wins when one label is a prefix of another.
    if scene_datasets_paths is not None:
        scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True))

    with open(metadata_filename, 'r') as f:
        input_metadata = json.load(f)
    metadata = dict()
    for key, value in input_metadata.items():
        # Optionally replace some paths
        if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
            if scene_datasets_paths is not None:
                for dataset_label, dataset_path in scene_datasets_paths.items():
                    if value.startswith(dataset_label):
                        value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label)))
                        break
        metadata[key] = value

    # Overload some parameters
    for key, value in overload_params.items():
        metadata[key] = value

    # Everything except the view list and output options parameterizes the simulator.
    generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))])
    generate_depth = metadata["generate_depth"]

    os.makedirs(output_dir, exist_ok=exist_ok)

    generator = MultiviewHabitatSimGenerator(**generation_entries)

    # Generate views
    for idx_label, data in tqdm(metadata['multiviews'].items()):
        positions = data["positions"]
        orientations = data["orientations"]
        n = len(positions)
        for oidx in range(n):
            observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx]))
            observation_label = f"{oidx + 1}"  # observation indices start at 1
            # Color image saved using PIL
            img = PIL.Image.fromarray(observation['color'][:,:,:3])
            filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
            img.save(filename)
            if generate_depth:
                # Depth image as EXR file (half-precision)
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                # Camera parameters
                camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                with open(filename, "w") as f:
                    json.dump(camera_params, f)
    # Save metadata
    with open(os.path.join(output_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f)

    generator.close()
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--metadata_filename", required=True)
+ parser.add_argument("--output_dir", required=True)
+ args = parser.parse_args()
+
+ generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename,
+ output_dir=args.output_dir,
+ scene_datasets_paths=SCENES_DATASET,
+ overload_params=dict(),
+ exist_ok=True)
+
+
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/generate_from_metadata_files.py b/croco/datasets/habitat_sim/generate_from_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..962ef849d8c31397b8622df4f2d9140175d78873
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_from_metadata_files.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script generating commandlines to generate image pairs from metadata files.
+"""
+import os
+import glob
+from tqdm import tqdm
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_dir", required=True)
+ parser.add_argument("--output_dir", required=True)
+ parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. to setup environment.")
+ args = parser.parse_args()
+
+ input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True)
+
+ for metadata_filename in tqdm(input_metadata_filenames):
+ output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir))
+ # Do not process the scene if the metadata file already exists
+ if os.path.exists(os.path.join(output_dir, "metadata.json")):
+ continue
+ commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}"
+ print(commandline)
diff --git a/croco/datasets/habitat_sim/generate_multiview_images.py b/croco/datasets/habitat_sim/generate_multiview_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..421d49a1696474415940493296b3f2d982398850
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_multiview_images.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from tqdm import tqdm
+import argparse
+import PIL.Image
+import numpy as np
+import json
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError
+from datasets.habitat_sim.paths import list_scenes_available
+import cv2
+import quaternion
+import shutil
+
def generate_multiview_images_for_scene(scene_dataset_config_file,
                                        scene,
                                        navmesh,
                                        output_dir,
                                        views_count,
                                        size,
                                        exist_ok=False,
                                        generate_depth=False,
                                        **kwargs):
    """
    Generate tuples of overlapping views for a given scene.
    generate_depth: generate depth images and camera parameters.

    The run is resumable: metadata.json is saved every 10 acquisitions and, on
    restart, reloaded to skip the already-generated indices (provided the
    saved parameters match the current ones). Scenes with no navigable space
    are silently skipped.
    """
    if os.path.exists(output_dir) and not exist_ok:
        print(f"Scene {scene}: data already generated. Ignoring generation.")
        return
    try:
        print(f"Scene {scene}: {size} multiview acquisitions to generate...")
        os.makedirs(output_dir, exist_ok=exist_ok)

        metadata_filename = os.path.join(output_dir, "metadata.json")

        # Template describing this generation run (without the views themselves).
        metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file,
                                 scene=scene,
                                 navmesh=navmesh,
                                 views_count=views_count,
                                 size=size,
                                 generate_depth=generate_depth,
                                 **kwargs)
        metadata_template["multiviews"] = dict()

        if os.path.exists(metadata_filename):
            print("Metadata file already exists:", metadata_filename)
            print("Loading already generated metadata file...")
            with open(metadata_filename, "r") as f:
                metadata = json.load(f)

            # Resuming is only valid if the saved run used identical parameters.
            for key in metadata_template.keys():
                if key != "multiviews":
                    assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}."
        else:
            print("No temporary file found. Starting generation from scratch...")
            metadata = metadata_template

        starting_id = len(metadata["multiviews"])
        print(f"Starting generation from index {starting_id}/{size}...")
        if starting_id >= size:
            print("Generation already done.")
            return

        generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file,
                                                 scene=scene,
                                                 navmesh=navmesh,
                                                 views_count = views_count,
                                                 size = size,
                                                 **kwargs)

        for idx in tqdm(range(starting_id, size)):
            # Generate / re-generate the observations
            try:
                data = generator[idx]
                observations = data["observations"]
                positions = data["positions"]
                orientations = data["orientations"]

                idx_label = f"{idx:08}"
                for oidx, observation in enumerate(observations):
                    observation_label = f"{oidx + 1}"  # observation indices start at 1
                    # Color image saved using PIL
                    img = PIL.Image.fromarray(observation['color'][:,:,:3])
                    filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
                    img.save(filename)
                    if generate_depth:
                        # Depth image as EXR file (half-precision)
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                        cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                        # Camera parameters
                        camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                        with open(filename, "w") as f:
                            json.dump(camera_params, f)
                metadata["multiviews"][idx_label] = {"positions": positions.tolist(),
                                                     "orientations": orientations.tolist(),
                                                     "covisibility_ratios": data["covisibility_ratios"].tolist(),
                                                     "valid_fractions": data["valid_fractions"].tolist(),
                                                     "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()}
            except RecursionError:
                print("Recursion error: unable to sample observations for this scene. We will stop there.")
                break

            # Regularly save a temporary metadata file, in case we need to restart the generation
            if idx % 10 == 0:
                with open(metadata_filename, "w") as f:
                    json.dump(metadata, f)

        # Save metadata
        with open(metadata_filename, "w") as f:
            json.dump(metadata, f)

        generator.close()
    except NoNaviguableSpaceError:
        # The scene has no navigable surface: nothing can be generated.
        pass
+
def create_commandline(scene_data, generate_depth, exist_ok=False):
    """Build the single-line shell command that generates one scene."""
    def _quote_empty(value):
        # Empty/None values are rendered as "" so the CLI still receives them.
        return '""' if value is None or value == "" else value

    parts = [
        f"python {__file__}",
        f"--scene {_quote_empty(scene_data.scene)}",
        f"--scene_dataset_config_file {_quote_empty(scene_data.scene_dataset_config_file)}",
        f"--navmesh {_quote_empty(scene_data.navmesh)}",
        f"--output_dir {_quote_empty(scene_data.output_dir)}",
        f"--generate_depth {int(generate_depth)}",
        f"--exist_ok {int(exist_ok)}",
    ]
    # Collapse all whitespace runs to single spaces, as the original did.
    return " ".join(" ".join(parts).split())
+
+if __name__ == "__main__":
+ os.umask(2)
+
+ parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available:
+ > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands
+ """)
+
+ parser.add_argument("--output_dir", type=str, required=True)
+ parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true")
+ parser.add_argument("--scene", type=str, default="")
+ parser.add_argument("--scene_dataset_config_file", type=str, default="")
+ parser.add_argument("--navmesh", type=str, default="")
+
+ parser.add_argument("--generate_depth", type=int, default=1)
+ parser.add_argument("--exist_ok", type=int, default=0)
+
+ kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000)
+
+ args = parser.parse_args()
+ generate_depth=bool(args.generate_depth)
+ exist_ok = bool(args.exist_ok)
+
+ if args.list_commands:
+ # Listing scenes available...
+ scenes_data = list_scenes_available(base_output_dir=args.output_dir)
+
+ for scene_data in scenes_data:
+ print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok))
+ else:
+ if args.scene == "" or args.output_dir == "":
+ print("Missing scene or output dir argument!")
+ print(parser.format_help())
+ else:
+ generate_multiview_images_for_scene(scene=args.scene,
+ scene_dataset_config_file = args.scene_dataset_config_file,
+ navmesh = args.navmesh,
+ output_dir = args.output_dir,
+ exist_ok=exist_ok,
+ generate_depth=generate_depth,
+ **kwargs)
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..91e5f923b836a645caf5d8e4aacc425047e3c144
--- /dev/null
+++ b/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
@@ -0,0 +1,390 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+import numpy as np
+import quaternion
+import habitat_sim
+import json
+from sklearn.neighbors import NearestNeighbors
+import cv2
+
# OpenCV to habitat camera convention transformation.
# Rows are the Habitat-frame directions of the OpenCV axes: RIGHT, -UP, FRONT.
R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0)
# Inverse mapping (for rotation matrices, the inverse is the transpose).
R_HABITAT2OPENCV = R_OPENCV2HABITAT.T
# Degrees-to-radians conversion factor.
DEG2RAD = np.pi / 180
+
def compute_camera_intrinsics(height, width, hfov):
    """Return pinhole intrinsics (focal, cu, cv) for an image of the given
    size and horizontal field of view (degrees); principal point at center."""
    focal = width/2 / np.tan(hfov/2 * np.pi/180)
    principal_u = width / 2
    principal_v = height / 2
    return focal, principal_u, principal_v
+
def compute_camera_pose_opencv_convention(camera_position, camera_orientation):
    """Convert a Habitat pose (position + quaternion orientation) into an
    OpenCV-convention cam2world rotation matrix and translation vector."""
    rotation_habitat = quaternion.as_rotation_matrix(camera_orientation)
    R_cam2world = rotation_habitat @ R_OPENCV2HABITAT
    return R_cam2world, np.asarray(camera_position)
+
def compute_pointmap(depthmap, hfov):
    """Back-project a HxW depth map into a HxWx3 point map in the camera frame."""
    height, width = depthmap.shape
    focal, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # Pixel-coordinate grids: u runs along columns, v along rows.
    u, v = np.meshgrid(range(width), range(height))
    # Pinhole back-projection: x = (u - cu) * z / f, y = (v - cv) * z / f.
    z = depthmap
    x = (u - cu) / focal * z
    y = (v - cv) / focal * z
    return np.stack((x, y, z), axis=-1)
+
def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
    """Return the Nx3 world-frame point cloud of all valid (non-zero depth) pixels."""
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation)

    points_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
    # A depth of exactly 0 marks an invalid pixel.
    valid = points_cam[:, :, 2] != 0.0
    points_cam = points_cam.reshape(-1, 3)[valid.flatten()]
    # Rigid transform into the world frame.
    return points_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)
+
def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False):
    """
    Compute 'overlapping' metrics based on a distance threshold between two point clouds.
    """
    def _count_close(source, target):
        # Number of `source` points whose nearest neighbor in `target` lies
        # within `distance_threshold`.
        knn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(target)
        dists, _ = knn.kneighbors(source)
        return np.count_nonzero(dists.flatten() < distance_threshold)

    data = {"intersection1": _count_close(pointcloud1, pointcloud2),
            "size1": len(pointcloud1)}
    if compute_symmetric:
        data["intersection2"] = _count_close(pointcloud2, pointcloud1)
        data["size2"] = len(pointcloud2)

    return data
+
def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
    """
    Store intrinsics and cam2world pose in the observation dictionary produced
    by Habitat-Sim. In-place modification.
    """
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation)
    height, width = observation['depth'].shape
    focal, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # 3x3 pinhole intrinsics matrix.
    observation["camera_intrinsics"] = np.asarray([[focal, 0, cu],
                                                   [0, focal, cv],
                                                   [0, 0, 1.0]])
    observation["t_cam2world"] = t_cam2world
    observation["R_cam2world"] = R_cam2world
+
def look_at(eye, center, up, return_cam2world=True):
    """
    Return camera pose looking at a given center point.
    Analogous of gluLookAt function, using OpenCV camera convention.
    """
    # Forward axis (OpenCV z) points from the eye towards the target.
    forward = center - eye
    forward /= np.linalg.norm(forward, axis=-1, keepdims=True)
    # Down axis (OpenCV y): start from -up, remove its forward component,
    # then renormalize (Gram-Schmidt step).
    down = -up
    down = down - np.sum(down * forward, axis=-1, keepdims=True) * forward
    down /= np.linalg.norm(down, axis=-1, keepdims=True)
    # Right axis (OpenCV x) completes the right-handed basis.
    right = np.cross(down, forward, axis=-1)

    if return_cam2world:
        rotation = np.stack((right, down, forward), axis=-1)
        translation = eye
    else:
        # World-to-camera: transposed rotation and t = -R @ eye.
        rotation = np.stack((right, down, forward), axis=-2)
        translation = - np.einsum('...ij, ...j', rotation, eye)
    return rotation, translation
+
def look_at_for_habitat(eye, center, up, return_cam2world=True):
    # NOTE(review): `return_cam2world` is accepted but never forwarded —
    # `look_at` is always called with its default (cam2world=True); confirm
    # whether the parameter should be honored or removed.
    R, t = look_at(eye, center, up)
    # Convert the OpenCV-convention rotation into a Habitat orientation quaternion.
    orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)
    return orientation, t
+
def generate_orientation_noise(pan_range, tilt_range, roll_range):
    """Random small pan/tilt/roll perturbation composed into one quaternion."""
    def _random_rotation(angle_range, axis):
        # Uniform random angle (degrees) about the given Habitat axis.
        return quaternion.from_rotation_vector(np.random.uniform(*angle_range) * DEG2RAD * axis)

    return (_random_rotation(pan_range, habitat_sim.geo.UP)
            * _random_rotation(tilt_range, habitat_sim.geo.RIGHT)
            * _random_rotation(roll_range, habitat_sim.geo.FRONT))
+
+
class NoNaviguableSpaceError(RuntimeError):
    """Raised when a scene provides no navigable space to sample viewpoints from."""
    pass
+
+class MultiviewHabitatSimGenerator:
+ def __init__(self,
+ scene,
+ navmesh,
+ scene_dataset_config_file,
+ resolution = (240, 320),
+ views_count=2,
+ hfov = 60,
+ gpu_id = 0,
+ size = 10000,
+ minimum_covisibility = 0.5,
+ transform = None):
+ self.scene = scene
+ self.navmesh = navmesh
+ self.scene_dataset_config_file = scene_dataset_config_file
+ self.resolution = resolution
+ self.views_count = views_count
+ assert(self.views_count >= 1)
+ self.hfov = hfov
+ self.gpu_id = gpu_id
+ self.size = size
+ self.transform = transform
+
+ # Noise added to camera orientation
+ self.pan_range = (-3, 3)
+ self.tilt_range = (-10, 10)
+ self.roll_range = (-5, 5)
+
+ # Height range to sample cameras
+ self.height_range = (1.2, 1.8)
+
+ # Random steps between the camera views
+ self.random_steps_count = 5
+ self.random_step_variance = 2.0
+
+ # Minimum fraction of the scene which should be valid (well defined depth)
+ self.minimum_valid_fraction = 0.7
+
+ # Distance threshold to see to select pairs
+ self.distance_threshold = 0.05
+ # Minimum IoU of a view point cloud with respect to the reference view to be kept.
+ self.minimum_covisibility = minimum_covisibility
+
+ # Maximum number of retries.
+ self.max_attempts_count = 100
+
+ self.seed = None
+ self._lazy_initialization()
+
+ def _lazy_initialization(self):
+ # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly
+ if self.seed == None:
+ # Re-seed numpy generator
+ np.random.seed()
+ self.seed = np.random.randint(2**32-1)
+ sim_cfg = habitat_sim.SimulatorConfiguration()
+ sim_cfg.scene_id = self.scene
+ if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "":
+ sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
+ sim_cfg.random_seed = self.seed
+ sim_cfg.load_semantic_mesh = False
+ sim_cfg.gpu_device_id = self.gpu_id
+
+ depth_sensor_spec = habitat_sim.CameraSensorSpec()
+ depth_sensor_spec.uuid = "depth"
+ depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
+ depth_sensor_spec.resolution = self.resolution
+ depth_sensor_spec.hfov = self.hfov
+ depth_sensor_spec.position = [0.0, 0.0, 0]
+ depth_sensor_spec.orientation
+
+ rgb_sensor_spec = habitat_sim.CameraSensorSpec()
+ rgb_sensor_spec.uuid = "color"
+ rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
+ rgb_sensor_spec.resolution = self.resolution
+ rgb_sensor_spec.hfov = self.hfov
+ rgb_sensor_spec.position = [0.0, 0.0, 0]
+ agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec])
+
+ cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
+ self.sim = habitat_sim.Simulator(cfg)
+ if self.navmesh is not None and self.navmesh != "":
+ # Use pre-computed navmesh when available (usually better than those generated automatically)
+ self.sim.pathfinder.load_nav_mesh(self.navmesh)
+
+ if not self.sim.pathfinder.is_loaded:
+ # Try to compute a navmesh
+ navmesh_settings = habitat_sim.NavMeshSettings()
+ navmesh_settings.set_defaults()
+ self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)
+
+ # Ensure that the navmesh is not empty
+ if not self.sim.pathfinder.is_loaded:
+ raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})")
+
+ self.agent = self.sim.initialize_agent(agent_id=0)
+
+ def close(self):
+ self.sim.close()
+
+ def __del__(self):
+ self.sim.close()
+
+ def __len__(self):
+ return self.size
+
+ def sample_random_viewpoint(self):
+ """ Sample a random viewpoint using the navmesh """
+ nav_point = self.sim.pathfinder.get_random_navigable_point()
+
+ # Sample a random viewpoint height
+ viewpoint_height = np.random.uniform(*self.height_range)
+ viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
+ viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return viewpoint_position, viewpoint_orientation, nav_point
+
+ def sample_other_random_viewpoint(self, observed_point, nav_point):
+ """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point."""
+ other_nav_point = nav_point
+
+ walk_directions = self.random_step_variance * np.asarray([1,0,1])
+ for i in range(self.random_steps_count):
+ temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3))
+ # Snapping may return nan when it fails
+ if not np.isnan(temp[0]):
+ other_nav_point = temp
+
+ other_viewpoint_height = np.random.uniform(*self.height_range)
+ other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP
+
+ # Set viewing direction towards the central point
+ rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True)
+ rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return position, rotation, other_nav_point
+
+ def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ pixels_count = self.resolution[0] * self.resolution[1]
+ valid_fraction = len(other_pointcloud) / pixels_count
+ assert valid_fraction <= 1.0 and valid_fraction >= 0.0
+ overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True)
+ covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count)
+ is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility)
+ return is_valid, valid_fraction, covisibility
+
+ def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation)
+ return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
+
+ def render_viewpoint(self, viewpoint_position, viewpoint_orientation):
+ agent_state = habitat_sim.AgentState()
+ agent_state.position = viewpoint_position
+ agent_state.rotation = viewpoint_orientation
+ self.agent.set_state(agent_state)
+ viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
+ _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation)
+ return viewpoint_observations
+
    def __getitem__(self, useless_idx):
        """Sample one multi-view item: a reference viewpoint plus
        (views_count - 1) nearby, sufficiently covisible viewpoints.

        The index is ignored: sampling is fully random and the dataset length
        only controls epoch size.
        NOTE(review): failure cases recurse via `return self[0]`, which can
        raise RecursionError on degenerate scenes — confirm this is acceptable.
        """
        ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        # Extract point cloud
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)

        pixels_count = self.resolution[0] * self.resolution[1]
        # Fraction of pixels whose depth lifted to a valid 3D point.
        ref_valid_fraction = len(ref_pointcloud) / pixels_count
        assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0
        if ref_valid_fraction < self.minimum_valid_fraction:
            # This should produce a recursion error at some point when something is very wrong.
            return self[0]
        # Pick an reference observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)

        # Add the first image as reference
        viewpoints_observations = [ref_observations]
        viewpoints_covisibility = [ref_valid_fraction]
        viewpoints_positions = [ref_position]
        viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
        viewpoints_clouds = [ref_pointcloud]
        viewpoints_valid_fractions = [ref_valid_fraction]

        for _ in range(self.views_count - 1):
            # Generate an other viewpoint using some dummy random walk
            successful_sampling = False
            for sampling_attempt in range(self.max_attempts_count):
                position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point)
                # Observation
                other_viewpoint_observations = self.render_viewpoint(position, rotation)
                other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation)

                is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
                if is_valid:
                    successful_sampling = True
                    break
            if not successful_sampling:
                print("WARNING: Maximum number of attempts reached.")
                # Dirty hack, try using a novel original viewpoint
                return self[0]
            # The loop above guarantees these names are bound when we get here.
            viewpoints_observations.append(other_viewpoint_observations)
            viewpoints_covisibility.append(covisibility)
            viewpoints_positions.append(position)
            viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding.
            viewpoints_clouds.append(other_pointcloud)
            viewpoints_valid_fractions.append(valid_fraction)

        # Estimate relations between all pairs of images
        pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations)))
        for i in range(len(viewpoints_observations)):
            # Diagonal: a view's overlap with itself is its valid fraction.
            pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i]
            for j in range(i+1, len(viewpoints_observations)):
                overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True)
                # Directed overlaps: matrix is not symmetric in general.
                pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count
                pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count

        # IoU is relative to the image 0
        data = {"observations": viewpoints_observations,
                "positions": np.asarray(viewpoints_positions),
                "orientations": np.asarray(viewpoints_orientations),
                "covisibility_ratios": np.asarray(viewpoints_covisibility),
                "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
                "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float),
                }

        if self.transform is not None:
            data = self.transform(data)
        return data
+
    def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False):
        """
        Return a list of images corresponding to a spiral trajectory from a random starting point.
        Useful to generate nice visualisations.
        Use an even number of half turns to get a nice "C1-continuous" loop effect

        Returns a tuple (images, all_valid) where `all_valid` is True when
        every rendered frame overlapped sufficiently with the reference view.
        """
        ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)
        pixels_count = self.resolution[0] * self.resolution[1]
        if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
            # Dirty hack: ensure that the valid part of the image is significant
            # NOTE(review): unbounded recursion if the scene has no valid viewpoint.
            return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation)

        # Pick an observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)
        ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation)

        images = []
        is_valid = []
        # Spiral trajectory, use_constant orientation
        for i, alpha in enumerate(np.linspace(0, 1, images_count)):
            r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius
            theta = alpha * half_turns * np.pi
            x = r * np.cos(theta)
            y = r * np.sin(theta)
            z = 0.0
            # Offset expressed in the reference camera frame, mapped to world.
            position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten()
            if use_constant_orientation:
                orientation = ref_orientation
            else:
                # trajectory looking at a mean point in front of the ref observation
                orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP)
            observations = self.render_viewpoint(position, orientation)
            # Drop the alpha channel, keep RGB only.
            images.append(observations['color'][...,:3])
            _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation)
            is_valid.append(_is_valid)
        return images, np.all(is_valid)
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/pack_metadata_files.py b/croco/datasets/habitat_sim/pack_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..10672a01f7dd615d3b4df37781f7f6f97e753ba6
--- /dev/null
+++ b/croco/datasets/habitat_sim/pack_metadata_files.py
@@ -0,0 +1,69 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+"""
+Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.
+"""
+import os
+import glob
+from tqdm import tqdm
+import shutil
+import json
+from datasets.habitat_sim.paths import *
+import argparse
+import collections
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    args = parser.parse_args()

    input_dirname = args.input_dir
    output_dirname = args.output_dir

    # Lazily iterate all metadata.json files under the input tree.
    input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True)

    # Per-dataset image counters; defaultdict(int) is the idiomatic
    # zero-default counter (replaces defaultdict(lambda: 0)).
    images_count = collections.defaultdict(int)

    os.makedirs(output_dirname)
    for input_filename in tqdm(input_metadata_filenames):
        # Ignore empty files
        with open(input_filename, "r") as f:
            original_metadata = json.load(f)
        if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0:
            print("No views in", input_filename)
            continue

        relpath = os.path.relpath(input_filename, input_dirname)
        print(relpath)

        # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
        # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern.
        scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True))
        metadata = dict()
        for key, value in original_metadata.items():
            if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
                known_path = False
                for dataset, dataset_path in scenes_dataset_paths.items():
                    if value.startswith(dataset_path):
                        value = os.path.join(dataset, os.path.relpath(value, dataset_path))
                        known_path = True
                        break
                if not known_path:
                    raise KeyError("Unknown path:" + value)
            metadata[key] = value

        # Compile some general statistics while packing data
        scene_split = metadata["scene"].split("/")
        # HM3D scenes are grouped by split (hm3d/<split>); others by dataset.
        upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
        images_count[upper_level] += len(metadata["multiviews"])

        output_filename = os.path.join(output_dirname, relpath)
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, "w") as f:
            json.dump(metadata, f)

    # Print statistics
    print("Images count:")
    for upper_level, count in images_count.items():
        print(f"- {upper_level}: {count}")
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/paths.py b/croco/datasets/habitat_sim/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d63b5fa29c274ddfeae084734a35ba66d7edee8
--- /dev/null
+++ b/croco/datasets/habitat_sim/paths.py
@@ -0,0 +1,129 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Paths to Habitat-Sim scenes
+"""
+
+import os
+import json
+import collections
+from tqdm import tqdm
+
+
# Hardcoded path to the different scene datasets
SCENES_DATASET = {
    "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/",
    "gibson": "./data/habitat-sim-data/scene_datasets/gibson/",
    "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/",
    "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/",
    "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/",
    "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/",
    "scannet": "./data/habitat-sim/scene_datasets/scannet/"
}

# Lightweight record describing one scene to render:
# - scene_dataset_config_file: habitat-sim dataset config ("" when unused)
# - scene: path or id of the scene mesh / scene instance
# - navmesh: path to a precomputed navmesh ("" to recompute on load)
# - output_dir: where generated data for this scene is written
SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"])
+
def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]):
    """Enumerate the ReplicaCAD scenes (apt_0..apt_5 plus the empty stage)."""
    config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json")
    scene_names = [f"apt_{i}" for i in range(6)] + ["empty_stage"]
    navmesh_files = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"]
    return [
        SceneData(
            scene_dataset_config_file=config_file,
            scene=name + ".scene_instance.json",
            navmesh=os.path.join(base_path, navmesh),
            output_dir=os.path.join(base_output_dir, "ReplicaCAD", name),
        )
        for name, navmesh in zip(scene_names, navmesh_files)
    ]
+
def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]):
    """Enumerate the ReplicaCAD "baked lighting" scenes.

    These scenes ship without navmesh files, so ``navmesh`` is left empty and
    the generator falls back to recomputing one at load time.
    (Removed a dead ``navmeshes = ""`` assignment with commented-out code.)
    """
    scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json")
    # 5 apartments x 21 staging variants, enumerated staging-major like the
    # original nested-list flattening.
    scenes = [f"Baked_sc{i}_staging_{j:02}" for j in range(21) for i in range(5)]
    scenes_data = []
    for scene in scenes:
        output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scene)
        scenes_data.append(SceneData(scene_dataset_config_file=scene_dataset_config_file,
                                     scene=scene,
                                     navmesh="",
                                     output_dir=output_dir))
    return scenes_data
+
def list_replica_scenes(base_output_dir, base_path):
    """Enumerate Replica scenes found as immediate subfolders of *base_path*."""
    scenes_data = []
    for scene_id in os.listdir(base_path):
        scenes_data.append(SceneData(
            scene_dataset_config_file="",
            scene=os.path.join(base_path, scene_id, "mesh.ply"),
            # Not sure if I should use it
            navmesh=os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh"),
            output_dir=os.path.join(base_output_dir, scene_id),
        ))
    return scenes_data
+
+
def list_scenes(base_output_dir, base_path):
    """
    Generic method iterating through a base_path folder to find scenes.
    """
    scenes_data = []
    for root, dirs, files in os.walk(base_path, followlinks=True):
        folder_data = []
        for filename in files:
            stem, extension = os.path.splitext(filename)
            if extension != ".glb":
                continue
            navmesh_path = os.path.join(root, stem + ".navmesh")
            if not os.path.exists(navmesh_path):
                navmesh_path = ""
            relpath = os.path.relpath(root, base_path)
            folder_data.append(SceneData(
                scene_dataset_config_file="",
                scene=os.path.join(root, stem + ".glb"),
                navmesh=navmesh_path,
                output_dir=os.path.abspath(os.path.join(base_output_dir, relpath, stem)),
            ))

        # Specific check for HM3D: when both xxxx.basis.glb and xxxx.glb are
        # present, keep the 'basis' version and drop the plain duplicate.
        basis_prefixes = {d.scene[:-len(".basis.glb")] for d in folder_data if d.scene.endswith(".basis.glb")}
        if basis_prefixes:
            folder_data = [d for d in folder_data if d.scene[:-len(".glb")] not in basis_prefixes]

        scenes_data.extend(folder_data)
    return scenes_data
+
def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
    """Collect SceneData entries for every supported scene dataset.

    Bug fix: the Replica entries were previously computed but discarded (the
    return value of list_replica_scenes was never appended); they are now
    included in the result.
    """
    scenes_data = []

    # HM3D
    for split in ("minival", "train", "val", "examples"):
        scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
                                   base_path=f"{scenes_dataset_paths['hm3d']}/{split}")

    # Gibson
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"),
                               base_path=scenes_dataset_paths["gibson"])

    # Habitat test scenes (just a few)
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
                               base_path=scenes_dataset_paths["habitat-test-scenes"])

    # ReplicaCAD (baked lighting)
    scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir)

    # ScanNet
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"),
                               base_path=scenes_dataset_paths["scannet"])

    # Replica
    scenes_data += list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"),
                                       base_path=scenes_dataset_paths["replica"])
    return scenes_data
diff --git a/croco/datasets/pairs_dataset.py b/croco/datasets/pairs_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f107526b34e154d9013a9a7a0bde3d5ff6f581c
--- /dev/null
+++ b/croco/datasets/pairs_dataset.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from torch.utils.data import Dataset
+from PIL import Image
+
+from datasets.transforms import get_pair_transforms
+
def load_image(impath):
    """Open the image at *impath* (PIL defers pixel decoding until use)."""
    return Image.open(impath)
+
def load_pairs_from_cache_file(fname, root=''):
    """Read a cache file of whitespace-separated image pairs, one pair per line."""
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = []
    for line in lines:
        tokens = line.split()
        pairs.append((os.path.join(root, tokens[0]), os.path.join(root, tokens[1])))
    return pairs
+
def load_pairs_from_list_file(fname, root=''):
    """Read a listing file of pair basenames; '#'-prefixed lines are comments.

    Each basename `x` expands to the pair (`x_1.jpg`, `x_2.jpg`) under *root*.
    """
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = []
    for line in lines:
        if line.startswith('#'):
            continue
        pairs.append((os.path.join(root, line + '_1.jpg'), os.path.join(root, line + '_2.jpg')))
    return pairs
+
+
def write_cache_file(fname, pairs, root=''):
    """Write image pairs to a cache file, stripping the *root* prefix from each path."""
    if len(root) > 0:
        if not root.endswith('/'):
            root += '/'
        assert os.path.isdir(root)
    lines = []
    for im1, im2 in pairs:
        if len(root) > 0:
            # Every stored path must live under root so the prefix strip is valid.
            assert im1.startswith(root), im1
            assert im2.startswith(root), im2
        lines.append('{:s} {:s}'.format(im1[len(root):], im2[len(root):]))
    with open(fname, 'w') as fid:
        # No trailing newline, matching the original s[:-1] behavior.
        fid.write('\n'.join(lines))
+
def parse_and_cache_all_pairs(dname, data_dir='./data/'):
    """Walk a dataset folder, list its image pairs and write them to a cache file."""
    if dname != 'habitat_release':
        raise NotImplementedError('Unknown dataset: '+dname)

    dirname = os.path.join(data_dir, 'habitat_release')
    assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
    cache_file = os.path.join(dirname, 'pairs.txt')
    assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file

    print('Parsing pairs for dataset: '+dname)
    pairs = []
    for root, dirs, files in os.walk(dirname):
        # Validation folders are excluded from the cache.
        if 'val' in root:
            continue
        dirs.sort()
        for f in sorted(files):
            if f.endswith('_1.jpeg'):
                pairs.append((os.path.join(root, f), os.path.join(root, f[:-len('_1.jpeg')] + '_2.jpeg')))
    print('Found {:,} pairs'.format(len(pairs)))
    print('Writing cache to: '+cache_file)
    write_cache_file(cache_file, pairs, root=dirname)
+
def dnames_to_image_pairs(dnames, data_dir='./data/'):
    """
    dnames: list of datasets with image pairs, separated by +

    Returns the concatenated list of (image1_path, image2_path) pairs.
    Raises NotImplementedError for an unknown dataset name (previously an
    unknown name fell through, leaving `pairs` unbound — NameError — or
    silently re-appending the previous iteration's pairs).
    """
    all_pairs = []
    for dname in dnames.split('+'):
        if dname == 'habitat_release':
            dirname = os.path.join(data_dir, 'habitat_release')
            assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
            cache_file = os.path.join(dirname, 'pairs.txt')
            assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file
            pairs = load_pairs_from_cache_file(cache_file, root=dirname)
        elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']:
            dirname = os.path.join(data_dir, dname+'_crops')
            assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
            list_file = os.path.join(dirname, 'listing.txt')
            assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. {:s}".format(dname, list_file)
            pairs = load_pairs_from_list_file(list_file, root=dirname)
        else:
            raise NotImplementedError('Unknown dataset: '+dname)
        print('  {:s}: {:,} pairs'.format(dname, len(pairs)))
        all_pairs += pairs
    if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs)))
    return all_pairs
+
+
class PairsDataset(Dataset):
    """Dataset of image pairs loaded from disk, with optional paired transforms."""

    def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'):
        super().__init__()
        self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir)
        self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize)

    def __len__(self):
        return len(self.image_pairs)

    def __getitem__(self, index):
        path1, path2 = self.image_pairs[index]
        img1, img2 = load_image(path1), load_image(path2)
        if self.transforms is None:
            return img1, img2
        return self.transforms(img1, img2)
+
+
# Script entry point: build and cache the pair listing for one dataset.
if __name__=="__main__":
    import argparse
    parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset")
    parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
    parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset")
    args = parser.parse_args()
    parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir)
diff --git a/croco/datasets/transforms.py b/croco/datasets/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..216bac61f8254fd50e7f269ee80301f250a2d11e
--- /dev/null
+++ b/croco/datasets/transforms.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+import torchvision.transforms
+import torchvision.transforms.functional as F
+
+# "Pair": apply a transform on a pair
+# "Both": apply the exact same transform to both images
+
class ComposePair(torchvision.transforms.Compose):
    """Compose whose component transforms each take and return an image pair."""
    def __call__(self, img1, img2):
        for transform in self.transforms:
            img1, img2 = transform(img1, img2)
        return img1, img2
+
class NormalizeBoth(torchvision.transforms.Normalize):
    """Apply the exact same normalization to both images of a pair."""
    def forward(self, img1, img2):
        return super().forward(img1), super().forward(img2)
+
class ToTensorBoth(torchvision.transforms.ToTensor):
    """Convert both images of a pair to tensors."""
    def __call__(self, img1, img2):
        return super().__call__(img1), super().__call__(img2)
+
class RandomCropPair(torchvision.transforms.RandomCrop):
    """Random crop applied independently to each image.

    The crop locations are intentionally different for the two images.
    """
    def forward(self, img1, img2):
        return super().forward(img1), super().forward(img2)
+
class ColorJitterPair(torchvision.transforms.ColorJitter):
    """Color jitter for image pairs.

    With probability `assymetric_prob` the second image is jittered with
    freshly drawn parameters (asymmetric); otherwise both images share the
    same jitter parameters (symmetric).
    """
    def __init__(self, assymetric_prob, **kwargs):
        super().__init__(**kwargs)
        self.assymetric_prob = assymetric_prob

    def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor):
        # Apply the selected adjustments in the randomly drawn order.
        adjustments = {
            0: (brightness_factor, F.adjust_brightness),
            1: (contrast_factor, F.adjust_contrast),
            2: (saturation_factor, F.adjust_saturation),
            3: (hue_factor, F.adjust_hue),
        }
        for fn_id in fn_idx:
            factor, adjust = adjustments[int(fn_id)]
            if factor is not None:
                img = adjust(img, factor)
        return img

    def forward(self, img1, img2):
        params = self.get_params(self.brightness, self.contrast, self.saturation, self.hue)
        img1 = self.jitter_one(img1, *params)
        if torch.rand(1) < self.assymetric_prob:  # asymmetric: re-draw for img2
            params = self.get_params(self.brightness, self.contrast, self.saturation, self.hue)
        img2 = self.jitter_one(img2, *params)
        return img1, img2
+
def get_pair_transforms(transform_str, totensor=True, normalize=True):
    """Build a pair transform from a spec string, e.g. "crop224+acolor".

    Returns None when no transform is requested, otherwise a callable taking
    (img1, img2) and returning the transformed pair.

    Bug fix: the single-transform case previously returned the bare list
    `trfs`, which is not callable — callers invoke `self.transforms(im1, im2)`
    and would crash. Any non-empty list is now wrapped in ComposePair.
    """
    trfs = []
    for s in transform_str.split('+'):
        if s.startswith('crop'):
            size = int(s[len('crop'):])
            trfs.append(RandomCropPair(size))
        elif s=='acolor':
            trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0))
        elif s=='': # if transform_str was ""
            pass
        else:
            raise NotImplementedError('Unknown augmentation: '+s)

    if totensor:
        trfs.append( ToTensorBoth() )
    if normalize:
        trfs.append( NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) )

    if len(trfs)==0:
        return None
    return ComposePair(trfs)
+
+
+
+
+
diff --git a/croco/demo.py b/croco/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b80ccc5c98c18e20d1ce782511aa824ef28f77
--- /dev/null
+++ b/croco/demo.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+from models.croco import CroCoNet
+from PIL import Image
+import torchvision.transforms
+from torchvision.transforms import ToTensor, Normalize, Compose
+
def main():
    """Run cross-view completion on the two demo images and save a visualization.

    Loads the pretrained CroCo checkpoint, masks the first view, reconstructs
    it conditioned on the second view, and writes a side-by-side PNG.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count()>0 else 'cpu')

    # load 224x224 images and transform them to tensor
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1,3,1,1).to(device, non_blocking=True)
    imagenet_std = [0.229, 0.224, 0.225]
    imagenet_std_tensor = torch.tensor(imagenet_std).view(1,3,1,1).to(device, non_blocking=True)
    trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)])
    image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
    image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)

    # load model
    ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
    model = CroCoNet( **ckpt.get('croco_kwargs',{})).to(device)
    model.eval()
    msg = model.load_state_dict(ckpt['model'], strict=True)

    # forward
    with torch.inference_mode():
        out, mask, target = model(image1, image2)

    # the output is normalized, thus use the mean/std of the actual image to go back to RGB space
    patchified = model.patchify(image1)
    mean = patchified.mean(dim=-1, keepdim=True)
    var = patchified.var(dim=-1, keepdim=True)
    decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean)
    # undo imagenet normalization, prepare masked image
    decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor
    input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor
    ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor
    # NOTE(review): assumes mask==1 marks masked patches — confirm against CroCoNet
    image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])
    masked_input_image = ((1 - image_masks) * input_image)

    # make visualization
    visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3) # 4*(B, 3, H, W) -> B, 3, H, W*4
    B, C, H, W = visualization.shape
    # Stack batch items vertically: (B, C, H, W*4) -> (C, B*H, W*4)
    visualization = visualization.permute(1, 0, 2, 3).reshape(C, B*H, W)
    visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1))
    fname = "demo_output.png"
    visualization.save(fname)
    print('Visualization save in '+fname)
+
+
# Script entry point.
if __name__=="__main__":
    main()
diff --git a/croco/interactive_demo.ipynb b/croco/interactive_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6cfc960af5baac9a69029c29a16eea4e24123a71
--- /dev/null
+++ b/croco/interactive_demo.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactive demo of Cross-view Completion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import numpy as np\n",
+ "from models.croco import CroCoNet\n",
+ "from ipywidgets import interact, interactive, fixed, interact_manual\n",
+ "import ipywidgets as widgets\n",
+ "import matplotlib.pyplot as plt\n",
+ "import quaternion\n",
+ "import models.masking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load CroCo model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n",
+ "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=True)\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "model = model.eval()\n",
+ "model = model.to(device=device)\n",
+ "print(msg)\n",
+ "\n",
+ "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n",
+ " \"\"\"\n",
+ " Perform Cross-View completion using two input images, specified using Numpy arrays.\n",
+ " \"\"\"\n",
+ " # Replace the mask generator\n",
+ " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n",
+ "\n",
+ " # ImageNet-1k color normalization\n",
+ " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n",
+ " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n",
+ "\n",
+ " normalize_input_colors = True\n",
+ " is_output_normalized = True\n",
+ " with torch.no_grad():\n",
+ " # Cast data to torch\n",
+ " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ "\n",
+ " if normalize_input_colors:\n",
+ " ref_image = (ref_image - imagenet_mean) / imagenet_std\n",
+ " target_image = (target_image - imagenet_mean) / imagenet_std\n",
+ "\n",
+ " out, mask, _ = model(target_image, ref_image)\n",
+ " # # get target\n",
+ " if not is_output_normalized:\n",
+ " predicted_image = model.unpatchify(out)\n",
+ " else:\n",
+ " # The output only contains higher order information,\n",
+ " # we retrieve mean and standard deviation from the actual target image\n",
+ " patchified = model.patchify(target_image)\n",
+ " mean = patchified.mean(dim=-1, keepdim=True)\n",
+ " var = patchified.var(dim=-1, keepdim=True)\n",
+ " pred_renorm = out * (var + 1.e-6)**.5 + mean\n",
+ " predicted_image = model.unpatchify(pred_renorm)\n",
+ "\n",
+ " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n",
+ " masked_target_image = (1 - image_masks) * target_image\n",
+ " \n",
+ " if not reconstruct_unmasked_patches:\n",
+ " # Replace unmasked patches by their actual values\n",
+ " predicted_image = predicted_image * image_masks + masked_target_image\n",
+ "\n",
+ " # Unapply color normalization\n",
+ " if normalize_input_colors:\n",
+ " predicted_image = predicted_image * imagenet_std + imagenet_mean\n",
+ " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n",
+ " \n",
+ " # Cast to Numpy\n",
+ " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " return masked_target_image, predicted_image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n",
+ "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n",
+ "import habitat_sim\n",
+ "\n",
+ "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n",
+ "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n",
+ "\n",
+ "sim_cfg = habitat_sim.SimulatorConfiguration()\n",
+ "if use_gpu: sim_cfg.gpu_device_id = 0\n",
+ "sim_cfg.scene_id = scene\n",
+ "sim_cfg.load_semantic_mesh = False\n",
+ "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n",
+ "rgb_sensor_spec.uuid = \"color\"\n",
+ "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n",
+ "rgb_sensor_spec.resolution = (224,224)\n",
+ "rgb_sensor_spec.hfov = 56.56\n",
+ "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n",
+ "rgb_sensor_spec.orientation = [0, 0, 0]\n",
+ "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n",
+ "\n",
+ "\n",
+ "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n",
+ "sim = habitat_sim.Simulator(cfg)\n",
+ "if navmesh is not None:\n",
+ " sim.pathfinder.load_nav_mesh(navmesh)\n",
+ "agent = sim.initialize_agent(agent_id=0)\n",
+ "\n",
+ "def sample_random_viewpoint():\n",
+ " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n",
+ " nav_point = sim.pathfinder.get_random_navigable_point()\n",
+ " # Sample a random viewpoint height\n",
+ " viewpoint_height = np.random.uniform(1.0, 1.6)\n",
+ " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n",
+ " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n",
+ " return viewpoint_position, viewpoint_orientation\n",
+ "\n",
+ "def render_viewpoint(position, orientation):\n",
+ " agent_state = habitat_sim.AgentState()\n",
+ " agent_state.position = position\n",
+ " agent_state.rotation = orientation\n",
+ " agent.set_state(agent_state)\n",
+ " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n",
+ " image = viewpoint_observations['color'][:,:,:3]\n",
+ " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n",
+ " return image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sample a random reference view"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ref_position, ref_orientation = sample_random_viewpoint()\n",
+ "ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ "plt.clf()\n",
+ "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n",
+ "axes[0,0].imshow(ref_image)\n",
+ "for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Interactive cross-view completion using CroCo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reconstruct_unmasked_patches = False\n",
+ "\n",
+ "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n",
+ " R = quaternion.as_rotation_matrix(ref_orientation)\n",
+ " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n",
+ " target_orientation = (ref_orientation\n",
+ " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n",
+ " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n",
+ " \n",
+ " ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ " target_image = render_viewpoint(target_position, target_orientation)\n",
+ "\n",
+ " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n",
+ "\n",
+ " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n",
+ " axes[0].imshow(ref_image)\n",
+ " axes[0].set_xlabel(\"Reference\")\n",
+ " axes[1].imshow(masked_target_image)\n",
+ " axes[1].set_xlabel(\"Masked target\")\n",
+ " axes[2].imshow(predicted_image)\n",
+ " axes[2].set_xlabel(\"Reconstruction\") \n",
+ " axes[3].imshow(target_image)\n",
+ " axes[3].set_xlabel(\"Target\")\n",
+ " for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])\n",
+ "\n",
+ "interact(show_demo,\n",
+ " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n",
+ " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n",
+ " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/croco/models/__pycache__/blocks.cpython-310.pyc b/croco/models/__pycache__/blocks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efa90d1c03a84c2f7a38687a5ebd669574b21742
Binary files /dev/null and b/croco/models/__pycache__/blocks.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/blocks.cpython-38.pyc b/croco/models/__pycache__/blocks.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5656e856bfb677b31397411f53e97897605bf57e
Binary files /dev/null and b/croco/models/__pycache__/blocks.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/croco.cpython-310.pyc b/croco/models/__pycache__/croco.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca15afa9df757bf3657afc2417916c50397a5a61
Binary files /dev/null and b/croco/models/__pycache__/croco.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/croco.cpython-38.pyc b/croco/models/__pycache__/croco.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cda8f1ec2e60077d9fb4a030c5fadd36db05ea42
Binary files /dev/null and b/croco/models/__pycache__/croco.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/dpt_block.cpython-310.pyc b/croco/models/__pycache__/dpt_block.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee797f8209b6197b7cefb7a0abab6b9b4114dfa3
Binary files /dev/null and b/croco/models/__pycache__/dpt_block.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/dpt_block.cpython-38.pyc b/croco/models/__pycache__/dpt_block.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ec48ecea7f4bc4a0222bd683e9758cdfcce70fa
Binary files /dev/null and b/croco/models/__pycache__/dpt_block.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/masking.cpython-310.pyc b/croco/models/__pycache__/masking.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3438d62138dac00d4d6e63691da265481006fd25
Binary files /dev/null and b/croco/models/__pycache__/masking.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/masking.cpython-38.pyc b/croco/models/__pycache__/masking.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b40acb53c29bf1bb9d4aeb6f19c6a4dee7b2bb0f
Binary files /dev/null and b/croco/models/__pycache__/masking.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/pos_embed.cpython-310.pyc b/croco/models/__pycache__/pos_embed.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97e0c3ef339c700e1c7b215d709e70a2d55d078b
Binary files /dev/null and b/croco/models/__pycache__/pos_embed.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/pos_embed.cpython-38.pyc b/croco/models/__pycache__/pos_embed.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45a53efdb0071da6cfa9f741cf0ab5a804ae8bb9
Binary files /dev/null and b/croco/models/__pycache__/pos_embed.cpython-38.pyc differ
diff --git a/croco/models/blocks.py b/croco/models/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..297187871314937a9a43f2901c1ffa8bb41cf762
--- /dev/null
+++ b/croco/models/blocks.py
@@ -0,0 +1,241 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Main encoder/decoder blocks
+# --------------------------------------------------------
+# References:
+# timm
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
+
+
+import torch
+import torch.nn as nn
+
+from itertools import repeat
+import collections.abc
+
+
def _ntuple(n):
    """Build a converter that expands a scalar into an n-tuple.

    Non-string iterables are assumed to already carry the right arity and are
    returned unchanged; everything else is repeated n times.
    """
    def parse(value):
        if isinstance(value, collections.abc.Iterable) and not isinstance(value, str):
            return value
        return tuple(repeat(value, n))
    return parse


# Common case: pairs such as (height, width) or (kernel_h, kernel_w).
to_2tuple = _ntuple(2)
+
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Stochastic Depth: randomly zero entire samples of a residual branch.

    A no-op at inference time or when drop_prob is 0. Otherwise each sample in
    the batch survives with probability 1 - drop_prob; surviving samples are
    optionally rescaled by 1/keep_prob so the expected magnitude is unchanged.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast across all remaining dims
    # (works for any tensor rank, not just 2D ConvNet activations).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        keep_mask.div_(keep_prob)
    return x * keep_mask
+
class DropPath(nn.Module):
    """Module wrapper around drop_path (Stochastic Depth per sample).

    Honors the module's own training flag, so it becomes the identity in eval mode.
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        # Shown in printed module summaries, e.g. DropPath(drop_prob=0.100).
        return f'drop_prob={round(self.drop_prob,3):0.3f}'
+
class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout,
    as used in Vision Transformer, MLP-Mixer and related networks."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # bias/drop may be a single value (shared by both layers) or a pair.
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
+
class Attention(nn.Module):
    """Multi-head self-attention with optional rotary position embedding (RoPE)."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(self, x, xpos):
        """Attend over x (B, N, C); xpos holds per-token positions, consumed only by RoPE."""
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # Joint q/k/v projection, then split heads: (B, num_heads, 3, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).transpose(1, 3)
        q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]

        if self.rope is not None:
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+
class Block(nn.Module):
    """Standard transformer encoder block: pre-norm self-attention and pre-norm MLP,
    each wrapped in a residual connection with optional stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # Stochastic depth on both residual branches; identity when drop_path == 0.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def forward(self, x, xpos):
        """xpos: per-token positions, forwarded to the attention layer (used by RoPE)."""
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        return x + self.drop_path(self.mlp(self.norm2(x)))
+
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries from one token set attend to keys/values
    from another, with optional RoPE applied to queries and keys."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        # Separate projections: queries and keys/values come from different sources.
        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.rope = rope

    def forward(self, query, key, value, qpos, kpos):
        """qpos/kpos: positions of the query and key tokens respectively (RoPE only)."""
        B, Nq, C = query.shape
        head_dim = C // self.num_heads

        def split_heads(tokens, proj):
            n = tokens.shape[1]
            return proj(tokens).reshape(B, n, self.num_heads, head_dim).permute(0, 2, 1, 3)

        q = split_heads(query, self.projq)
        k = split_heads(key, self.projk)
        v = split_heads(value, self.projv)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, Nq, C)
        return self.proj_drop(self.proj(out))
+
class DecoderBlock(nn.Module):
    """Transformer decoder block: self-attention on x, cross-attention from x to the
    memory tokens y, then an MLP — all pre-norm with residual connections."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.norm3 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        # Optionally normalize the memory tokens before they are cross-attended.
        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()

    def forward(self, x, y, xpos, ypos):
        # Self-attention over x's tokens.
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        # Cross-attention: x queries attend to the memory tokens y.
        y_normed = self.norm_y(y)
        x = x + self.drop_path(self.cross_attn(self.norm2(x), y_normed, y_normed, xpos, ypos))
        x = x + self.drop_path(self.mlp(self.norm3(x)))
        # y is passed through unchanged so decoder blocks can be chained uniformly.
        return x, y
+
+
+# patch embedding
class PositionGetter(object):
    """Produce the (row, col) grid coordinate of every patch of an h x w feature map.

    The coordinate grid is cached per (h, w) so it is built only once per resolution.
    """

    def __init__(self):
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        if (h, w) not in self.cache_positions:
            xs = torch.arange(w, device=device)
            ys = torch.arange(h, device=device)
            # All (y, x) pairs in row-major order; shape (h*w, 2).
            self.cache_positions[h, w] = torch.cartesian_prod(ys, xs)
        # Broadcast the cached grid to the batch: (b, h*w, 2).
        return self.cache_positions[h, w].view(1, h * w, 2).expand(b, -1, 2).clone()
+
class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution.

    Equivalent to timm's PatchEmbed, plus a _init_weights helper and a
    PositionGetter that returns the grid coordinate of every patch.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        # Non-overlapping patch projection: kernel size == stride == patch size.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

        self.position_getter = PositionGetter()

    def forward(self, x):
        """Return (tokens, positions): tokens (B, N, C) when flattened, positions (B, N, 2)."""
        B, C, H, W = x.shape
        torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
        torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
        x = self.proj(x)
        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, pos

    def _init_weights(self):
        # MAE-style init: xavier-uniform over the flattened conv weight.
        w = self.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
diff --git a/croco/models/criterion.py b/croco/models/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..11696c40865344490f23796ea45e8fbd5e654731
--- /dev/null
+++ b/croco/models/criterion.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Criterion to train CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# --------------------------------------------------------
+
+import torch
+
class MaskedMSE(torch.nn.Module):
    """Mean-squared-error criterion for CroCo pretraining.

    norm_pix_loss: normalize each target patch by its own mean and variance
        before comparing (as in MAE).
    masked: average the per-patch loss over masked patches only; otherwise
        average over every patch.
    """

    def __init__(self, norm_pix_loss=False, masked=True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss
        self.masked = masked

    def forward(self, pred, mask, target):
        """pred/target: (N, L, D) patch tensors; mask: (N, L), nonzero where masked."""
        if self.norm_pix_loss:
            mu = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mu) / (var + 1.e-6)**.5

        per_patch = ((pred - target) ** 2).mean(dim=-1)  # (N, L): mean loss per patch
        if self.masked:
            # Average only over the masked patches.
            return (per_patch * mask).sum() / mask.sum()
        return per_patch.mean()
diff --git a/croco/models/croco.py b/croco/models/croco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0145e353831ac29e5889a7ec9a70191c71a0028d
--- /dev/null
+++ b/croco/models/croco.py
@@ -0,0 +1,249 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# CroCo model during pretraining
+# --------------------------------------------------------
+
+
+
+import torch
+import torch.nn as nn
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+from functools import partial
+
+from croco.models.blocks import Block, DecoderBlock, PatchEmbed
+from croco.models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
+from croco.models.masking import RandomMask
+
+
class CroCoNet(nn.Module):
    """CroCo pretraining network.

    A shared ViT encoder processes a masked first image and an unmasked second
    image; a cross-attending decoder then reconstructs the masked patches of
    the first image. forward() returns (prediction, mask, patchified target).
    """

    def __init__(self,
                 img_size=224,              # input image size
                 patch_size=16,             # patch_size
                 mask_ratio=0.9,            # ratios of masked tokens
                 enc_embed_dim=768,         # encoder feature dimension
                 enc_depth=12,              # encoder depth
                 enc_num_heads=12,          # encoder number of heads in the transformer block
                 dec_embed_dim=512,         # decoder feature dimension
                 dec_depth=8,               # decoder depth
                 dec_num_heads=16,          # decoder number of heads in the transformer block
                 mlp_ratio=4,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_im2_in_dec=True,      # whether to apply normalization of the 'memory' = (second image) in the decoder
                 pos_embed='cosine',        # positional embedding (either cosine or RoPE100)
                ):

        super(CroCoNet, self).__init__()

        # patch embeddings  (with initialization done as in MAE)
        self._set_patch_embed(img_size, patch_size, enc_embed_dim)

        # mask generations
        self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)

        self.pos_embed = pos_embed
        if pos_embed=='cosine':
            # positional embedding of the encoder
            enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
            # positional embedding of the decoder
            dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
            # pos embedding in each block
            self.rope = None # nothing for cosine
        elif pos_embed.startswith('RoPE'): # eg RoPE100
            self.enc_pos_embed = None # nothing to add in the encoder with RoPE
            self.dec_pos_embed = None # nothing to add in the decoder with RoPE
            if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
            # The numeric suffix of the pos_embed string is the RoPE base frequency.
            freq = float(pos_embed[len('RoPE'):])
            self.rope = RoPE2D(freq=freq)
        else:
            raise NotImplementedError('Unknown pos_embed '+pos_embed)

        # transformer for the encoder
        self.enc_depth = enc_depth
        self.enc_embed_dim = enc_embed_dim
        self.enc_blocks = nn.ModuleList([
            Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
            for i in range(enc_depth)])
        self.enc_norm = norm_layer(enc_embed_dim)

        # masked tokens
        self._set_mask_token(dec_embed_dim)

        # decoder
        self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)

        # prediction head
        self._set_prediction_head(dec_embed_dim, patch_size)

        # initializer weights
        self.initialize_weights()

    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
        # Setter kept separate so downstream subclasses can override it.
        self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)

    def _set_mask_generator(self, num_patches, mask_ratio):
        # Setter kept separate so downstream subclasses can disable masking.
        self.mask_generator = RandomMask(num_patches, mask_ratio)

    def _set_mask_token(self, dec_embed_dim):
        # Learnable token substituted for every masked patch in the decoder input.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))

    def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
        """Build the decoder: a linear enc->dec projection plus DecoderBlocks."""
        self.dec_depth = dec_depth
        self.dec_embed_dim = dec_embed_dim
        # transfer from encoder to decoder
        self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
        # transformer for the decoder
        self.dec_blocks = nn.ModuleList([
            DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
            for i in range(dec_depth)])
        # final norm layer
        self.dec_norm = norm_layer(dec_embed_dim)

    def _set_prediction_head(self, dec_embed_dim, patch_size):
        # Maps each decoder token to the RGB values of its patch (patch_size**2 * 3).
        self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)


    def initialize_weights(self):
        """Initialize all weights: patch embed (MAE-style), mask token, linears/norms."""
        # patch embed
        self.patch_embed._init_weights()
        # mask tokens
        if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
        # linears and layer norms
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module init applied by self.apply(): xavier for Linear, unit LayerNorm."""
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _encode_image(self, image, do_mask=False, return_all_blocks=False):
        """
        image has B x 3 x img_size x img_size
        do_mask: whether to perform masking or not
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)

        Returns (features, pos, masks): pos covers ALL patches even when masking,
        while features only contain the visible ones.
        """
        # embed the image into patches (x has size B x Npatches x C)
        # and get position if each return patch (pos has size B x Npatches x 2)
        x, pos = self.patch_embed(image)
        # add positional embedding without cls token
        if self.enc_pos_embed is not None:
            x = x + self.enc_pos_embed[None,...]
        # apply masking
        B,N,C = x.size()
        if do_mask:
            masks = self.mask_generator(x)
            # Keep only the visible tokens; the mask drops the same count per sample.
            x = x[~masks].view(B, -1, C)
            posvis = pos[~masks].view(B, -1, 2)
        else:
            B,N,C = x.size()
            # NOTE(review): this mask is created on the default (CPU) device
            # regardless of x.device — confirm downstream boolean indexing copes.
            masks = torch.zeros((B,N), dtype=bool)
            posvis = pos
        # now apply the transformer encoder and normalization
        if return_all_blocks:
            out = []
            for blk in self.enc_blocks:
                x = blk(x, posvis)
                out.append(x)
            # Only the final block's output is normalized, matching the single-output path.
            out[-1] = self.enc_norm(out[-1])
            return out, pos, masks
        else:
            for blk in self.enc_blocks:
                x = blk(x, posvis)
            x = self.enc_norm(x)
            return x, pos, masks

    def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
        """
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)

        masks1 can be None => assume image1 fully visible
        """
        # encoder to decoder layer
        visf1 = self.decoder_embed(feat1)
        f2 = self.decoder_embed(feat2)
        # append masked tokens to the sequence
        B,Nenc,C = visf1.size()
        if masks1 is None: # downstreams
            f1_ = visf1
        else: # pretraining
            Ntotal = masks1.size(1)
            # Start from mask tokens everywhere, then scatter the visible tokens back in.
            f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
            f1_[~masks1] = visf1.view(B * Nenc, C)
        # add positional embedding
        if self.dec_pos_embed is not None:
            f1_ = f1_ + self.dec_pos_embed
            f2 = f2 + self.dec_pos_embed
        # apply Transformer blocks
        out = f1_
        out2 = f2
        if return_all_blocks:
            _out, out = out, []
            for blk in self.dec_blocks:
                _out, out2 = blk(_out, out2, pos1, pos2)
                out.append(_out)
            out[-1] = self.dec_norm(out[-1])
        else:
            for blk in self.dec_blocks:
                out, out2 = blk(out, out2, pos1, pos2)
            out = self.dec_norm(out)
        return out

    def patchify(self, imgs):
        """
        imgs: (B, 3, H, W)
        x: (B, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        # Only square images whose side is a multiple of the patch size are supported.
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))

        return x

    def unpatchify(self, x, channels=3):
        """
        x: (N, L, patch_size**2 *channels)
        imgs: (N, 3, H, W)
        """
        patch_size = self.patch_embed.patch_size[0]
        # Assumes a square patch grid (L must be a perfect square).
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # NOTE(review): width uses h * patch_size; safe only because h == w above.
        imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
        return imgs

    def forward(self, img1, img2):
        """
        img1: tensor of size B x 3 x img_size x img_size
        img2: tensor of size B x 3 x img_size x img_size

        out will be B x N x (3*patch_size*patch_size)
        masks are also returned as B x N just in case
        """
        # encoder of the masked first image
        feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
        # encoder of the second image
        feat2, pos2, _ = self._encode_image(img2, do_mask=False)
        # decoder
        decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
        # prediction head
        out = self.prediction_head(decfeat)
        # get target
        target = self.patchify(img1)
        return out, mask1, target
diff --git a/croco/models/croco_downstream.py b/croco/models/croco_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..159dfff4d2c1461bc235e21441b57ce1e2088f76
--- /dev/null
+++ b/croco/models/croco_downstream.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# CroCo model for downstream tasks
+# --------------------------------------------------------
+
+import torch
+
+from .croco import CroCoNet
+
+
+def croco_args_from_ckpt(ckpt):
+ if 'croco_kwargs' in ckpt: # CroCo v2 released models
+ return ckpt['croco_kwargs']
+ elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release
+ s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"
+ assert s.startswith('CroCoNet(')
+ return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it
+ else: # CroCo v1 released models
+ return dict()
+
+class CroCoDownstreamMonocularEncoder(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for monocular downstream task, only using the encoder.
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ NOTE: It works by *calling super().__init__() but with redefined setters
+
+ """
+ super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_decoder(self, *args, **kwargs):
+ """ No decoder """
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No 'prediction head' for downstream tasks."""
+ return
+
+ def forward(self, img):
+ """
+ img if of size batch_size x 3 x h x w
+ """
+ B, C, H, W = img.size()
+ img_info = {'height': H, 'width': W}
+ need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers)
+ return self.head(out, img_info)
+
+
+class CroCoDownstreamBinocular(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for binocular downstream task
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ """
+ super(CroCoDownstreamBinocular, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No prediction head for downstream tasks, define your own head """
+ return
+
+ def encode_image_pairs(self, img1, img2, return_all_blocks=False):
+ """ run encoder for a pair of images
+ it is actually ~5% faster to concatenate the images along the batch dimension
+ than to encode them separately
+ """
+ ## the two commented lines below is the naive version with separate encoding
+ #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks)
+ #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False)
+ ## and now the faster version
+ out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks )
+ if return_all_blocks:
+ out,out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out])))
+ out2 = out2[-1]
+ else:
+ out,out2 = out.chunk(2, dim=0)
+ pos,pos2 = pos.chunk(2, dim=0)
+ return out, out2, pos, pos2
+
+ def forward(self, img1, img2):
+ B, C, H, W = img1.size()
+ img_info = {'height': H, 'width': W}
+ return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks)
+ if return_all_blocks:
+ decout = self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ decout = out+decout
+ else:
+ decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ return self.head(decout, img_info)
\ No newline at end of file
diff --git a/croco/models/curope/__init__.py b/croco/models/curope/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878
--- /dev/null
+++ b/croco/models/curope/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from .curope2d import cuRoPE2D
diff --git a/croco/models/curope/__pycache__/__init__.cpython-310.pyc b/croco/models/curope/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae664f61817c08229de3b3bd25d09e67c512fb68
Binary files /dev/null and b/croco/models/curope/__pycache__/__init__.cpython-310.pyc differ
diff --git a/croco/models/curope/__pycache__/__init__.cpython-38.pyc b/croco/models/curope/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c0e5621f0af15475b19c83026bdab80bf4055e
Binary files /dev/null and b/croco/models/curope/__pycache__/__init__.cpython-38.pyc differ
diff --git a/croco/models/curope/__pycache__/curope2d.cpython-310.pyc b/croco/models/curope/__pycache__/curope2d.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e4c5bd8907288ae181f415c682f7f57f430674
Binary files /dev/null and b/croco/models/curope/__pycache__/curope2d.cpython-310.pyc differ
diff --git a/croco/models/curope/__pycache__/curope2d.cpython-38.pyc b/croco/models/curope/__pycache__/curope2d.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6d3788feadd6ad7a5c123f5ddb9191fee182dcb
Binary files /dev/null and b/croco/models/curope/__pycache__/curope2d.cpython-38.pyc differ
diff --git a/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so b/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..54cb8272bdaeecf27eb33e7b36377339e8a4ba3b
Binary files /dev/null and b/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps
new file mode 100644
index 0000000000000000000000000000000000000000..6898f9369bb638a32223f5ed309410a5c55bcb77
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log
new file mode 100644
index 0000000000000000000000000000000000000000..dded40ff4052fee4defdd363fa72572da12ec117
--- /dev/null
+++ b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log
@@ -0,0 +1,4 @@
+# ninja log v5
+0 18414 1711080902080587065 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o 1ee4a1dd32c06eb6
+3 17217 1711081567047778386 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o 1ee4a1dd32c06eb6
+3 234838 1711081784655296109 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o d5d1e582a9379a6b
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja b/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja
new file mode 100644
index 0000000000000000000000000000000000000000..75f901ccb7b6145f0b3d0c5a56a9e4117b1afb9b
--- /dev/null
+++ b/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja
@@ -0,0 +1,33 @@
+ninja_required_version = 1.3
+cxx = c++
+nvcc = /usr/local/cuda/bin/nvcc
+
+cflags = -pthread -B /home/hy/anaconda3/envs/dust3r/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/hy/anaconda3/envs/dust3r/include -fPIC -O2 -isystem /home/hy/anaconda3/envs/dust3r/include -fPIC -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/TH -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/hy/anaconda3/envs/dust3r/include/python3.11 -c
+post_cflags = -O3 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=curope -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
+cuda_cflags = -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/TH -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/hy/anaconda3/envs/dust3r/include/python3.11 -c
+cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O3 --ptxas-options=-v --use_fast_math -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=curope -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
+cuda_dlink_post_cflags =
+ldflags =
+
+rule compile
+ command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
+ depfile = $out.d
+ deps = gcc
+
+rule cuda_compile
+ depfile = $out.d
+ deps = gcc
+ command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags
+
+
+
+
+
+build /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o: compile /data/hunterj-projects/dust3r/croco/models/curope/curope.cpp
+build /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o: cuda_compile /data/hunterj-projects/dust3r/croco/models/curope/kernels.cu
+
+
+
+
+
+
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o b/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o
new file mode 100644
index 0000000000000000000000000000000000000000..25a1503c7698cb5202cb176904e3c3a62a28687f
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o b/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o
new file mode 100644
index 0000000000000000000000000000000000000000..532e378f66f053ac04f6842fed02a063260f0cba
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o differ
diff --git a/croco/models/curope/curope.cpp b/croco/models/curope/curope.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b
--- /dev/null
+++ b/croco/models/curope/curope.cpp
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+
+// forward declaration
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
+
+void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
+{
+ const int B = tokens.size(0);
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3) / 4;
+
+ auto tok = tokens.accessor<float, 4>();
+ auto pos = positions.accessor<int64_t, 3>();
+
+ for (int b = 0; b < B; b++) {
+ for (int x = 0; x < 2; x++) { // y and then x (2d)
+ for (int n = 0; n < N; n++) {
+
+ // grab the token position
+ const int p = pos[b][n][x];
+
+ for (int h = 0; h < H; h++) {
+ for (int d = 0; d < D; d++) {
+ // grab the two values
+ float u = tok[b][n][h][d+0+x*2*D];
+ float v = tok[b][n][h][d+D+x*2*D];
+
+ // grab the cos,sin
+ const float inv_freq = fwd * p / powf(base, d/float(D));
+ float c = cosf(inv_freq);
+ float s = sinf(inv_freq);
+
+ // write the result
+ tok[b][n][h][d+0+x*2*D] = u*c - v*s;
+ tok[b][n][h][d+D+x*2*D] = v*c + u*s;
+ }
+ }
+ }
+ }
+ }
+}
+
+void rope_2d( torch::Tensor tokens, // B,N,H,D
+ const torch::Tensor positions, // B,N,2
+ const float base,
+ const float fwd )
+{
+ TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+ TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+ TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+ TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+ TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
+ TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
+
+ if (tokens.is_cuda())
+ rope_2d_cuda( tokens, positions, base, fwd );
+ else
+ rope_2d_cpu( tokens, positions, base, fwd );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
+}
diff --git a/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so b/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..54cb8272bdaeecf27eb33e7b36377339e8a4ba3b
Binary files /dev/null and b/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so differ
diff --git a/croco/models/curope/curope2d.py b/croco/models/curope/curope2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49c12f8c529e9a889b5ac20c5767158f238e17d
--- /dev/null
+++ b/croco/models/curope/curope2d.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+
+try:
+ import curope as _kernels # run `python setup.py install`
+except ModuleNotFoundError:
+ from . import curope as _kernels # run `python setup.py build_ext --inplace`
+
+
+class cuRoPE2D_func (torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, tokens, positions, base, F0=1):
+ ctx.save_for_backward(positions)
+ ctx.saved_base = base
+ ctx.saved_F0 = F0
+ # tokens = tokens.clone() # uncomment this if inplace doesn't work
+ _kernels.rope_2d( tokens, positions, base, F0 )
+ ctx.mark_dirty(tokens)
+ return tokens
+
+ @staticmethod
+ def backward(ctx, grad_res):
+ positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+ _kernels.rope_2d( grad_res, positions, base, -F0 )
+ ctx.mark_dirty(grad_res)
+ return grad_res, None, None, None
+
+
+class cuRoPE2D(torch.nn.Module):
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+
+ def forward(self, tokens, positions):
+ cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
+ return tokens
\ No newline at end of file
diff --git a/croco/models/curope/kernels.cu b/croco/models/curope/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7156cd1bb935cb1f0be45e58add53f9c21505c20
--- /dev/null
+++ b/croco/models/curope/kernels.cu
@@ -0,0 +1,108 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+ TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+ TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template < typename scalar_t >
+__global__ void rope_2d_cuda_kernel(
+ //scalar_t* __restrict__ tokens,
+ torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
+ const int64_t* __restrict__ pos,
+ const float base,
+ const float fwd )
+ // const int N, const int H, const int D )
+{
+ // tokens shape = (B, N, H, D)
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3);
+
+ // each block update a single token, for all heads
+ // each thread takes care of a single output
+ extern __shared__ float shared[];
+ float* shared_inv_freq = shared + D;
+
+ const int b = blockIdx.x / N;
+ const int n = blockIdx.x % N;
+
+ const int Q = D / 4;
+ // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
+ // u_Y v_Y u_X v_X
+
+ // shared memory: first, compute inv_freq
+ if (threadIdx.x < Q)
+ shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+ __syncthreads();
+
+ // start of X or Y part
+ const int X = threadIdx.x < D/2 ? 0 : 1;
+ const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
+
+ // grab the cos,sin appropriate for me
+ const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
+ const float cos = cosf(freq);
+ const float sin = sinf(freq);
+ /*
+ float* shared_cos_sin = shared + D + D/4;
+ if ((threadIdx.x % (D/2)) < Q)
+ shared_cos_sin[m+0] = cosf(freq);
+ else
+ shared_cos_sin[m+Q] = sinf(freq);
+ __syncthreads();
+ const float cos = shared_cos_sin[m+0];
+ const float sin = shared_cos_sin[m+Q];
+ */
+
+ for (int h = 0; h < H; h++)
+ {
+ // then, load all the token for this head in shared memory
+ shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+ __syncthreads();
+
+ const float u = shared[m];
+ const float v = shared[m+Q];
+
+ // write output
+ if ((threadIdx.x % (D/2)) < Q)
+ tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+ else
+ tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+ }
+}
+
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
+{
+ const int B = tokens.size(0); // batch size
+ const int N = tokens.size(1); // sequence length
+ const int H = tokens.size(2); // number of heads
+ const int D = tokens.size(3); // dimension per head
+
+ TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+ TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+ TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
+ TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
+
+ // one block for each layer, one thread per local-max
+ const int THREADS_PER_BLOCK = D;
+ const int N_BLOCKS = B * N; // each block takes care of H*D values
+ const int SHARED_MEM = sizeof(float) * (D + D/4);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
+ rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
+ //tokens.data_ptr<scalar_t>(),
+ tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
+ pos.data_ptr<int64_t>(),
+ base, fwd); //, N, H, D );
+ }));
+}
diff --git a/croco/models/curope/setup.py b/croco/models/curope/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..230632ed05e309200e8f93a3a852072333975009
--- /dev/null
+++ b/croco/models/curope/setup.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
+# alternatively, you can list cuda archs that you want, eg:
+# all_cuda_archs = [
+ # '-gencode', 'arch=compute_70,code=sm_70',
+ # '-gencode', 'arch=compute_75,code=sm_75',
+ # '-gencode', 'arch=compute_80,code=sm_80',
+ # '-gencode', 'arch=compute_86,code=sm_86'
+# ]
+
+setup(
+ name = 'curope',
+ ext_modules = [
+ CUDAExtension(
+ name='curope',
+ sources=[
+ "curope.cpp",
+ "kernels.cu",
+ ],
+ extra_compile_args = dict(
+ nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
+ cxx=['-O3'])
+ )
+ ],
+ cmdclass = {
+ 'build_ext': BuildExtension
+ })
diff --git a/croco/models/dpt_block.py b/croco/models/dpt_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..33a3b63c6267270883c2e620ca226059d63dc8df
--- /dev/null
+++ b/croco/models/dpt_block.py
@@ -0,0 +1,450 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# DPT head for ViTs
+# --------------------------------------------------------
+# References:
+# https://github.com/isl-org/DPT
+# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from typing import Union, Tuple, Iterable, List, Optional, Dict
+
+def pair(t):
+ return t if isinstance(t, tuple) else (t, t)
+
+def make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ out_shape4 = out_shape
+ if expand == True:
+ out_shape1 = out_shape
+ out_shape2 = out_shape * 2
+ out_shape3 = out_shape * 4
+ out_shape4 = out_shape * 8
+
+ scratch.layer1_rn = nn.Conv2d(
+ in_shape[0],
+ out_shape1,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer2_rn = nn.Conv2d(
+ in_shape[1],
+ out_shape2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer3_rn = nn.Conv2d(
+ in_shape[2],
+ out_shape3,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer4_rn = nn.Conv2d(
+ in_shape[3],
+ out_shape4,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+
+ scratch.layer_rn = nn.ModuleList([
+ scratch.layer1_rn,
+ scratch.layer2_rn,
+ scratch.layer3_rn,
+ scratch.layer4_rn,
+ ])
+
+ return scratch
+
+class ResidualConvUnit_custom(nn.Module):
+ """Residual convolution module."""
+
+ def __init__(self, features, activation, bn):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups = 1
+
+ self.conv1 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ self.conv2 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ if self.bn == True:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x): # RefineNet中的RCU组件: Residual Conv Unit
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x) # ReLU
+ out = self.conv1(out)
+ if self.bn == True:
+ out = self.bn1(out)
+
+ out = self.activation(out) # ReLU
+ out = self.conv2(out)
+ if self.bn == True:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x) # 残差连接
+
+class FeatureFusionBlock_custom(nn.Module):
+ """Feature fusion block."""
+
+ def __init__(
+ self,
+ features,
+ activation,
+ deconv=False,
+ bn=False,
+ expand=False,
+ align_corners=True,
+ width_ratio=1,
+ ):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock_custom, self).__init__()
+ self.width_ratio = width_ratio
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups = 1
+
+ self.expand = expand
+ out_features = features
+ if self.expand == True:
+ out_features = features // 2
+
+ self.out_conv = nn.Conv2d(
+ features,
+ out_features,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=True,
+ groups=1,
+ )
+
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, *xs): # Multi-Path Refinement
+ """Forward pass.
+ Returns:
+ tensor: output
+ """
+ output = xs[0] # 前面小分辨率图像对应的RefineNet的累计输出结果
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])# 当前分辨率图像对应RefineNet的输入
+ if self.width_ratio != 1: # 不执行
+ res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
+
+ output = self.skip_add.add(output, res) # 残差连接 output += res
+
+
+ output = self.resConfUnit2(output)
+
+ if self.width_ratio != 1:
+ # and output.shape[3] < self.width_ratio * output.shape[2]
+ #size=(image.shape[])
+ if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
+ shape = 3 * output.shape[3]
+ else:
+ shape = int(self.width_ratio * 2 * output.shape[2])
+ output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
+ else:
+ output = nn.functional.interpolate(output, scale_factor=2,
+ mode="bilinear", align_corners=self.align_corners)
+ output = self.out_conv(output)
+ return output
+
+def make_fusion_block(features, use_bn, width_ratio=1):
+ return FeatureFusionBlock_custom(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ width_ratio=width_ratio,
+ )
+
+class Interpolate(nn.Module):
+ """Interpolation module."""
+
+ def __init__(self, scale_factor, mode, align_corners=False):
+ """Init.
+ Args:
+ scale_factor (float): scaling
+ mode (str): interpolation mode
+ """
+ super(Interpolate, self).__init__()
+
+ self.interp = nn.functional.interpolate
+ self.scale_factor = scale_factor
+ self.mode = mode
+ self.align_corners = align_corners
+
+ def forward(self, x):
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: interpolated data
+ """
+
+ x = self.interp(
+ x,
+ scale_factor=self.scale_factor,
+ mode=self.mode,
+ align_corners=self.align_corners,
+ )
+
+ return x
+
+class DPTOutputAdapter(nn.Module):
+ """DPT output adapter.
+
+ :param num_channels: Number of output channels
+ :param stride_level: Stride level compared to the full-sized image.
+ E.g. 4 for 1/4th the size of the image.
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
+ Patch size for smaller inputs will be computed accordingly.
+ :param hooks: Index of intermediate layers
+ :param layer_dims: Dimension of intermediate layers
+ :param feature_dim: Feature dimension
+ :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
+ :param use_bn: If set to True, activates batch norm
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+
+ def __init__(self,
+ num_channels: int = 1,
+ stride_level: int = 1,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ main_tasks: Iterable[str] = ('rgb',),
+ hooks: List[int] = [2, 5, 8, 11],
+ layer_dims: List[int] = [96, 192, 384, 768],
+ feature_dim: int = 256,
+ last_dim: int = 32,
+ use_bn: bool = False,
+ dim_tokens_enc: Optional[int] = None,
+ head_type: str = 'regression',
+ output_width_ratio=1,
+ **kwargs):
+ super().__init__()
+ self.num_channels = num_channels
+ self.stride_level = stride_level
+ self.patch_size = pair(patch_size)
+ self.main_tasks = main_tasks
+ self.hooks = hooks
+ self.layer_dims = layer_dims
+ self.feature_dim = feature_dim
+ self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
+ self.head_type = head_type
+
+ # Actual patch height and width, taking into account stride of input
+ self.P_H = max(1, self.patch_size[0] // stride_level)
+ self.P_W = max(1, self.patch_size[1] // stride_level)
+
+ self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
+
+ self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+
+ if self.head_type == 'regression':
+ # The "DPTDepthModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
+ )
+ elif self.head_type == 'semseg':
+ # The "DPTSegmentationModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
+ nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
+ nn.ReLU(True),
+ nn.Dropout(0.1, False),
+ nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ )
+ else:
+ raise ValueError('DPT head_type must be "regression" or "semseg".')
+
+ if self.dim_tokens_enc is not None:
+ self.init(dim_tokens_enc=dim_tokens_enc)
+
+ def init(self, dim_tokens_enc=768):
+ """
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
+ Should be called when setting up MultiMAE.
+
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+ #print(dim_tokens_enc)
+
+ # Set up activation postprocessing layers
+ if isinstance(dim_tokens_enc, int):
+ dim_tokens_enc = 4 * [dim_tokens_enc]
+
+ self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
+
+ self.act_1_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=4, stride=4, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_2_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=2, stride=2, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_3_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[2],
+ out_channels=self.layer_dims[2],
+ kernel_size=1, stride=1, padding=0,
+ )
+ )
+
+ self.act_4_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.Conv2d(
+ in_channels=self.layer_dims[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=3, stride=2, padding=1,
+ )
+ )
+
+ self.act_postprocess = nn.ModuleList([
+ self.act_1_postprocess,
+ self.act_2_postprocess,
+ self.act_3_postprocess,
+ self.act_4_postprocess
+ ])
+
+ def adapt_tokens(self, encoder_tokens):
+ # Adapt tokens
+ x = []
+ x.append(encoder_tokens[:, :])
+ x = torch.cat(x, dim=-1)
+ return x
+
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size):
+ #input_info: Dict):
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
+ H, W = image_size
+
+ # Number of patches in height and width
+ N_H = H // (self.stride_level * self.P_H)
+ N_W = W // (self.stride_level * self.P_W)
+
+ # Hook decoder onto 4 layers from specified ViT layers
+ layers = [encoder_tokens[hook] for hook in self.hooks]
+
+ # Extract only task-relevant tokens and ignore global tokens.
+ layers = [self.adapt_tokens(l) for l in layers]
+
+ # Reshape tokens to spatial representation
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
+
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
+ # Project layers to chosen feature dim
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
+
+ # Fuse layers using refinement stages
+ path_4 = self.scratch.refinenet4(layers[3])
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
+
+ # Output head
+ out = self.head(path_1)
+
+ return out
diff --git a/croco/models/head_downstream.py b/croco/models/head_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd40c91ba244d6c3522c6efd4ed4d724b7bdc650
--- /dev/null
+++ b/croco/models/head_downstream.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Heads for downstream tasks
+# --------------------------------------------------------
+
+"""
+A head is a module where the __init__ defines only the head hyperparameters.
+A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes.
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height'
+"""
+
+import torch
+import torch.nn as nn
+from .dpt_block import DPTOutputAdapter
+
+
+class PixelwiseTaskWithDPT(nn.Module):
+ """ DPT module for CroCo.
+ by default, hooks_idx will be equal to:
+ * for encoder-only: 4 equally spread layers
+ * for encoder+decoder: last encoder + 3 equally spread layers of the decoder
+ """
+
+ def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768],
+ output_width_ratio=1, num_channels=1, postprocess=None, **kwargs):
+ super(PixelwiseTaskWithDPT, self).__init__()
+ self.return_all_blocks = True # backbone needs to return all layers
+ self.postprocess = postprocess
+ self.output_width_ratio = output_width_ratio
+ self.num_channels = num_channels
+ self.hooks_idx = hooks_idx
+ self.layer_dims = layer_dims
+
+ def setup(self, croconet):
+ dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels}
+ if self.hooks_idx is None:
+ if hasattr(croconet, 'dec_blocks'): # encoder + decoder
+ step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth]
+ hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ else: # encoder only
+ step = croconet.enc_depth//4
+ hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ self.hooks_idx = hooks_idx
+ print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}')
+ dpt_args['hooks'] = self.hooks_idx
+ dpt_args['layer_dims'] = self.layer_dims
+ self.dpt = DPTOutputAdapter(**dpt_args)
+        dim_tokens = [croconet.enc_embed_dim if hook<croconet.enc_depth else croconet.dec_embed_dim for hook in self.hooks_idx]
+        self.dpt.init(dim_tokens_enc=dim_tokens)
+
+    def forward(self, x, img_info):
+        out = self.dpt(x, image_size=(img_info['height'], img_info['width']))
+        if self.postprocess: out = self.postprocess(out)
+        return out
diff --git a/croco/models/pos_embed.py b/croco/models/pos_embed.py
new file mode 100644
--- /dev/null
+++ b/croco/models/pos_embed.py
@@ -0,0 +1,155 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Position embedding utils
+# --------------------------------------------------------
+
+import numpy as np
+
+import torch
+
+# --------------------------------------------------------
+# 2D sine-cosine position embedding
+# References:
+# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
+# MoCo v3: https://github.com/facebookresearch/moco-v3
+# --------------------------------------------------------
+def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if n_cls_token>0:
+        pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ assert embed_dim % 2 == 0
+
+ # use half of dimensions to encode grid_h
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded: size (M,)
+ out: (M, D)
+ """
+ assert embed_dim % 2 == 0
+ omega = np.arange(embed_dim // 2, dtype=float)
+ omega /= embed_dim / 2.
+ omega = 1. / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):
+ if 'pos_embed' in checkpoint_model:
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
+ embedding_size = pos_embed_checkpoint.shape[-1]
+ num_patches = model.patch_embed.num_patches
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+ # height (== width) for the checkpoint position embedding
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+ # height (== width) for the new position embedding
+ new_size = int(num_patches ** 0.5)
+ # class_token and dist_token are kept unchanged
+ if orig_size != new_size:
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+ # only the position tokens are interpolated
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+ pos_tokens = torch.nn.functional.interpolate(
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+ checkpoint_model['pos_embed'] = new_pos_embed
+
+
+#----------------------------------------------------------
+# RoPE2D: RoPE implementation in 2D
+#----------------------------------------------------------
+
+try:
+ from models.curope import cuRoPE2D
+ RoPE2D = cuRoPE2D
+except ImportError:
+ print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
+
+ class RoPE2D(torch.nn.Module):
+
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype):
+ if (D,seq_len,device,dtype) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
+ return self.cache[D,seq_len,device,dtype]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim==2
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
+ output:
+            * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
+ D = tokens.size(3) // 2
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
+ # split features into two along the feature dimension, and apply rope1d on each half
+ y, x = tokens.chunk(2, dim=-1)
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
+ tokens = torch.cat((y, x), dim=-1)
+ return tokens
\ No newline at end of file
diff --git a/croco/pretrain.py b/croco/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c45e488015ef5380c71d0381ff453fdb860759e
--- /dev/null
+++ b/croco/pretrain.py
@@ -0,0 +1,254 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Pre-training CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+import math
+from pathlib import Path
+from typing import Iterable
+
+import torch
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torch.utils.tensorboard import SummaryWriter
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+import utils.misc as misc
+from utils.misc import NativeScalerWithGradNormCount as NativeScaler
+from models.croco import CroCoNet
+from models.criterion import MaskedMSE
+from datasets.pairs_dataset import PairsDataset
+
+
+def get_args_parser():
+ parser = argparse.ArgumentParser('CroCo pre-training', add_help=False)
+ # model and criterion
+ parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build")
+ parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss")
+ # dataset
+ parser.add_argument('--dataset', default='habitat_release', type=str, help="training set")
+ parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful
+ # training
+ parser.add_argument('--seed', default=0, type=int, help="Random seed")
+ parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus")
+ parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler")
+ parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch")
+ parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)")
+ parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)")
+ parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)')
+ parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
+ parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0')
+ parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR')
+ parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining")
+ # others
+ parser.add_argument('--num_workers', default=8, type=int)
+ parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
+ parser.add_argument('--local_rank', default=-1, type=int)
+ parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
+ parser.add_argument('--save_freq', default=1, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth')
+ parser.add_argument('--keep_freq', default=20, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth')
+ parser.add_argument('--print_freq', default=20, type=int, help='frequence (number of iterations) to print infos while training')
+ # paths
+ parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output")
+ parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
+ return parser
+
+
+
+
+def main(args):
+ misc.init_distributed_mode(args)
+ global_rank = misc.get_rank()
+ world_size = misc.get_world_size()
+
+ print("output_dir: "+args.output_dir)
+ if args.output_dir:
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+ # auto resume
+ last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth')
+ args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+ print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
+ print("{}".format(args).replace(', ', ',\n'))
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ device = torch.device(device)
+
+ # fix the seed
+ seed = args.seed + misc.get_rank()
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+ cudnn.benchmark = True
+
+ ## training dataset and loader
+ print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms))
+ dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir)
+ if world_size>1:
+ sampler_train = torch.utils.data.DistributedSampler(
+ dataset, num_replicas=world_size, rank=global_rank, shuffle=True
+ )
+ print("Sampler_train = %s" % str(sampler_train))
+ else:
+ sampler_train = torch.utils.data.RandomSampler(dataset)
+ data_loader_train = torch.utils.data.DataLoader(
+ dataset, sampler=sampler_train,
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=True,
+ )
+
+ ## model
+ print('Loading model: {:s}'.format(args.model))
+ model = eval(args.model)
+ print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss))))
+ criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss))
+
+ model.to(device)
+ model_without_ddp = model
+ print("Model = %s" % str(model_without_ddp))
+
+ eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
+ if args.lr is None: # only base_lr is specified
+ args.lr = args.blr * eff_batch_size / 256
+ print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
+ print("actual lr: %.2e" % args.lr)
+ print("accumulate grad iterations: %d" % args.accum_iter)
+ print("effective batch size: %d" % eff_batch_size)
+
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True)
+ model_without_ddp = model.module
+
+ param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers
+ optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+ print(optimizer)
+ loss_scaler = NativeScaler()
+
+ misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
+
+ if global_rank == 0 and args.output_dir is not None:
+ log_writer = SummaryWriter(log_dir=args.output_dir)
+ else:
+ log_writer = None
+
+ print(f"Start training until {args.max_epoch} epochs")
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.max_epoch):
+ if world_size>1:
+ data_loader_train.sampler.set_epoch(epoch)
+
+ train_stats = train_one_epoch(
+ model, criterion, data_loader_train,
+ optimizer, device, epoch, loss_scaler,
+ log_writer=log_writer,
+ args=args
+ )
+
+ if args.output_dir and epoch % args.save_freq == 0 :
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch, fname='last')
+
+ if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1):
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch)
+
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+ 'epoch': epoch,}
+
+ if args.output_dir and misc.is_main_process():
+ if log_writer is not None:
+ log_writer.flush()
+ with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
+ f.write(json.dumps(log_stats) + "\n")
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+
+
+
+def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
+ data_loader: Iterable, optimizer: torch.optim.Optimizer,
+ device: torch.device, epoch: int, loss_scaler,
+ log_writer=None,
+ args=None):
+ model.train(True)
+ metric_logger = misc.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+ accum_iter = args.accum_iter
+
+ optimizer.zero_grad()
+
+ if log_writer is not None:
+ print('log_dir: {}'.format(log_writer.log_dir))
+
+ for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
+
+ # we use a per iteration lr scheduler
+ if data_iter_step % accum_iter == 0:
+ misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)
+
+ image1 = image1.to(device, non_blocking=True)
+ image2 = image2.to(device, non_blocking=True)
+ with torch.cuda.amp.autocast(enabled=bool(args.amp)):
+ out, mask, target = model(image1, image2)
+ loss = criterion(out, mask, target)
+
+ loss_value = loss.item()
+
+ if not math.isfinite(loss_value):
+ print("Loss is {}, stopping training".format(loss_value))
+ sys.exit(1)
+
+ loss /= accum_iter
+ loss_scaler(loss, optimizer, parameters=model.parameters(),
+ update_grad=(data_iter_step + 1) % accum_iter == 0)
+ if (data_iter_step + 1) % accum_iter == 0:
+ optimizer.zero_grad()
+
+ torch.cuda.synchronize()
+
+ metric_logger.update(loss=loss_value)
+
+ lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(lr=lr)
+
+ loss_value_reduce = misc.all_reduce_mean(loss_value)
+ if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0:
+ # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes
+ epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
+ log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
+ log_writer.add_scalar('lr', lr, epoch_1000x)
+
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ print("Averaged stats:", metric_logger)
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+
+if __name__ == '__main__':
+ args = get_args_parser()
+ args = args.parse_args()
+ main(args)
diff --git a/croco/stereoflow/README.MD b/croco/stereoflow/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34
--- /dev/null
+++ b/croco/stereoflow/README.MD
@@ -0,0 +1,318 @@
+## CroCo-Stereo and CroCo-Flow
+
+This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained.
+All commands should be launched from the root directory.
+
+### Simple inference example
+
+We provide a simple inference example for CroCo-Stereo and CroCo-Flow in the notebook `croco-stereo-flow-demo.ipynb`.
+Before running it, please download the trained models with:
+```
+bash stereoflow/download_model.sh crocostereo.pth
+bash stereoflow/download_model.sh crocoflow.pth
+```
+
+### Prepare data for training or evaluation
+
+Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`).
+Please find below how the file structure should look for each dataset:
+
+FlyingChairs
+
+```
+./data/stereoflow/FlyingChairs/
+└───chairs_split.txt
+└───data/
+ └─── ...
+```
+
+
+
+MPI-Sintel
+
+```
+./data/stereoflow/MPI-Sintel/
+└───training/
+│ └───clean/
+│ └───final/
+│ └───flow/
+└───test/
+ └───clean/
+ └───final/
+```
+
+
+
+SceneFlow (including FlyingThings)
+
+```
+./data/stereoflow/SceneFlow/
+└───Driving/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+└───FlyingThings/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+│ └───optical_flow/
+└───Monkaa/
+ └───disparity/
+ └───frames_cleanpass/
+ └───frames_finalpass/
+```
+
+
+
+TartanAir
+
+```
+./data/stereoflow/TartanAir/
+└───abandonedfactory/
+│ └───.../
+└───abandonedfactory_night/
+│ └───.../
+└───.../
+```
+
+
+
+Booster
+
+```
+./data/stereoflow/booster_gt/
+└───train/
+ └───balanced/
+ └───Bathroom/
+ └───Bedroom/
+ └───...
+```
+
+
+
+CREStereo
+
+```
+./data/stereoflow/crenet_stereo_trainset/
+└───stereo_trainset/
+ └───crestereo/
+ └───hole/
+ └───reflective/
+ └───shapenet/
+ └───tree/
+```
+
+
+
+ETH3D Two-view Low-res
+
+```
+./data/stereoflow/eth3d_lowres/
+└───test/
+│ └───lakeside_1l/
+│ └───...
+└───train/
+│ └───delivery_area_1l/
+│ └───...
+└───train_gt/
+ └───delivery_area_1l/
+ └───...
+```
+
+
+
+KITTI 2012
+
+```
+./data/stereoflow/kitti-stereo-2012/
+└───testing/
+│ └───colored_0/
+│ └───colored_1/
+└───training/
+ └───colored_0/
+ └───colored_1/
+ └───disp_occ/
+ └───flow_occ/
+```
+
+
+
+KITTI 2015
+
+```
+./data/stereoflow/kitti-stereo-2015/
+└───testing/
+│ └───image_2/
+│ └───image_3/
+└───training/
+ └───image_2/
+ └───image_3/
+ └───disp_occ_0/
+ └───flow_occ/
+```
+
+
+
+Middlebury
+
+```
+./data/stereoflow/middlebury
+└───2005/
+│ └───train/
+│ └───Art/
+│ └───...
+└───2006/
+│ └───Aloe/
+│ └───Baby1/
+│ └───...
+└───2014/
+│ └───Adirondack-imperfect/
+│ └───Adirondack-perfect/
+│ └───...
+└───2021/
+│ └───data/
+│ └───artroom1/
+│ └───artroom2/
+│ └───...
+└───MiddEval3_F/
+ └───test/
+ │ └───Australia/
+ │ └───...
+ └───train/
+ └───Adirondack/
+ └───...
+```
+
+
+
+Spring
+
+```
+./data/stereoflow/spring/
+└───test/
+│ └───0003/
+│ └───...
+└───train/
+ └───0001/
+ └───...
+```
+
+
+
+### CroCo-Stereo
+
+##### Main model
+
+The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo.pth
+# Middlebury v3 submission
+python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9
+# Training command that was used, using checkpoint-last.pth
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus:
+torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+```
+
+For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo_subtrain.pth
+# Evaluation on validation sets
+python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9
+# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/
+```
+
+##### Other models
+
+
+ Model for ETH3D
+ The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss.
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_eth3d.pth
+ # ETH3D submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/
+
+
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9
+ # Training that was used
+ python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the `--pretrained` argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use `bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth`, and for the model with a ViT-Base encoder and a Base decoder, use `bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth`.
+
+
+
+### CroCo-Flow
+
+##### Main model
+
+The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets.
+It was used for our submission to the MPI-Sintel benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocoflow.pth
+# Evaluation
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9
+# Sintel submission
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9
+# Training command that was used, with checkpoint-best.pth
+python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/
+```
+
+##### Other models
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99
+ # Training that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Flow with smaller CroCo pretrained models, simply replace the `--pretrained` argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use `bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth`, and for the model with a ViT-Base encoder and a Base decoder, use `bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth`.
+
diff --git a/croco/stereoflow/augmentor.py b/croco/stereoflow/augmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e6117151988d94cbc4b385e0d88e982133bf10
--- /dev/null
+++ b/croco/stereoflow/augmentor.py
@@ -0,0 +1,290 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Data augmentation for training stereo and flow
+# --------------------------------------------------------
+
+# References
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py
+
+
+import numpy as np
+import random
+from PIL import Image
+
+import cv2
+cv2.setNumThreads(0)
+cv2.ocl.setUseOpenCL(False)
+
+import torch
+from torchvision.transforms import ColorJitter
+import torchvision.transforms.functional as FF
+
+class StereoAugmentor(object):
+
    def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5):
        """Data augmentor for stereo training pairs (scale, crop, flips, color jitter).

        Args:
            crop_size: (height, width) of the final crop.
            scale_prob: probability of applying random rescaling.
            scale_xonly: if True, rescale along x only (the disparity axis).
            lhth: image-size threshold selecting between the low (`lminscale`/`lmaxscale`)
                and high (`hminscale`/`hmaxscale`) scale ranges — exact use is in
                `_random_scale`, whose body is not fully visible here; TODO confirm.
            scale_interp_nearest: if True, resize the disparity map with nearest-neighbor
                interpolation instead of bilinear.
            rightjitterprob: probability of jittering the right image.
            v_flip_prob: probability of a vertical flip of the whole pair.
            color_aug_asym: if True, color augmentation may differ between the two images.
            color_choice_prob: probability parameter for the color-augmentation choice.
        """
        self.crop_size = crop_size
        self.scale_prob = scale_prob
        self.scale_xonly = scale_xonly
        self.lhth = lhth
        self.lminscale = lminscale
        self.lmaxscale = lmaxscale
        self.hminscale = hminscale
        self.hmaxscale = hmaxscale
        self.scale_interp_nearest = scale_interp_nearest
        self.rightjitterprob = rightjitterprob
        self.v_flip_prob = v_flip_prob
        self.color_aug_asym = color_aug_asym
        self.color_choice_prob = color_choice_prob
+
+ def _random_scale(self, img1, img2, disp):
+ ch,cw = self.crop_size
+ h,w = img1.shape[:2]
+ if self.scale_prob>0. and np.random.rand()1.:
+ scale_x = clip_scale
+ scale_y = scale_x if not self.scale_xonly else 1.0
+ img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x
+ return img1, img2, disp
+
+ def _random_crop(self, img1, img2, disp):
+ h,w = img1.shape[:2]
+ ch,cw = self.crop_size
+ assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw)
+ offset_x = np.random.randint(w - cw + 1)
+ offset_y = np.random.randint(h - ch + 1)
+ img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ return img1, img2, disp
+
+ def _random_vflip(self, img1, img2, disp):
+ # vertical flip
+ if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob:
+ img1 = np.copy(np.flipud(img1))
+ img2 = np.copy(np.flipud(img2))
+ disp = np.copy(np.flipud(disp))
+ return img1, img2, disp
+
+ def _random_rotate_shift_right(self, img2):
+ if self.rightjitterprob>0. and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1)
+ xx = xx[v]
+ yy = yy[v]
+ flow1 = flow1[v]
+
+ flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones
+ flow[yy, xx] = flow1
+ return flow
+
    def spatial_transform(self, img1, img2, flow, dname):
        """Randomly rescale, flip and crop an image pair and its optical flow.

        The sequence of np.random draws below defines the augmentation and must not
        be reordered. `dname` is the dataset name; Spring stores ground truth at twice
        the image resolution, hence the factor-2 flow resizing in that case.
        Returns (img1, img2, flow) cropped to ``self.crop_size``.
        """
        if np.random.rand() < self.spatial_aug_prob:
            # randomly sample scale
            ht, wd = img1.shape[:2]
            # never scale below what still allows a crop_size (+8 margin) crop
            clip_min_scale = np.maximum(
                (self.crop_size[0] + 8) / float(ht),
                (self.crop_size[1] + 8) / float(wd))
            min_scale, max_scale = self.min_scale, self.max_scale  # NOTE: unused locals, kept as-is
            scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
            scale_x = scale
            scale_y = scale
            # optionally stretch x and y independently
            if np.random.rand() < self.stretch_prob:
                scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
                scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
            scale_x = np.clip(scale_x, clip_min_scale, None)
            scale_y = np.clip(scale_y, clip_min_scale, None)
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0)
        elif dname=="Spring":
            # no rescaling, but Spring's double-resolution flow must still be halved
            flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0)

        if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip
            img1 = img1[:, ::-1]
            img2 = img2[:, ::-1]
            flow = flow[:, ::-1] * [-1.0, 1.0]  # horizontal flip negates the x component

        if self.v_flip_prob>0. and np.random.rand() < self.v_flip_prob: # v-flip
            img1 = img1[::-1, :]
            img2 = img2[::-1, :]
            flow = flow[::-1, :] * [1.0, -1.0]  # vertical flip negates the y component

        # In case no cropping
        if img1.shape[0] - self.crop_size[0] > 0:
            y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0])
        else:
            y0 = 0
        if img1.shape[1] - self.crop_size[1] > 0:
            x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1])
        else:
            x0 = 0

        img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
        img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
        flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]

        return img1, img2, flow
+
+ def __call__(self, img1, img2, flow, dname):
+ img1, img2, flow = self.spatial_transform(img1, img2, flow, dname)
+ img1, img2 = self.color_transform(img1, img2)
+ img1 = np.ascontiguousarray(img1)
+ img2 = np.ascontiguousarray(img2)
+ flow = np.ascontiguousarray(flow)
+ return img1, img2, flow
\ No newline at end of file
diff --git a/croco/stereoflow/criterion.py b/croco/stereoflow/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..57792ebeeee34827b317a4d32b7445837bb33f17
--- /dev/null
+++ b/croco/stereoflow/criterion.py
@@ -0,0 +1,251 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Losses, metrics per batch, metrics per dataset
+# --------------------------------------------------------
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+def _get_gtnorm(gt):
+ if gt.size(1)==1: # stereo
+ return gt
+ # flow
+ return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW
+
+############ losses without confidence
+
+class L1Loss(nn.Module):
+
    def __init__(self, max_gtnorm=None):
        """Plain L1 loss (no confidence input).

        Args:
            max_gtnorm: if not None, appears to be used in forward to exclude
                pixels whose ground-truth magnitude is too large — the forward
                body is partially garbled in this view; TODO confirm.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = False  # this criterion takes no confidence map
+
+ def _error(self, gt, predictions):
+ return torch.abs(gt-predictions)
+
+ def forward(self, predictions, gt, inspect=False):
+ mask = torch.isfinite(gt)
+ if self.max_gtnorm is not None:
+ mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant
+
+
+class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b
    def __init__(self, max_gtnorm=10000., a=0.25, b=4.):
        """Bounded Laplacian loss with confidence, used for CroCo-Flow.

        In the paper's equation, a = 1/b (per the class comment above).

        Args:
            max_gtnorm: pixels with ground-truth magnitude above this are excluded
                (forward body partially garbled in this view; TODO confirm).
            a, b: bounding parameters of the confidence term.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True  # forward expects a confidence map
        self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b
    def __init__(self, max_gtnorm=None, a=3.0, b=3.0):
        """Bounded Laplacian loss with confidence, used for CroCo-Stereo (except ETH3D).

        In the paper's equation, a = b (per the class comment above).

        Args:
            max_gtnorm: optional exclusion threshold on ground-truth magnitude
                (forward body partially garbled in this view; TODO confirm).
            a, b: bounding parameters of the confidence term.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True  # forward expects a confidence map
        self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+############## metrics per batch
+
class StereoMetrics(nn.Module):
    """Batch-level stereo disparity metrics: avgerr, rmse and bad-pixel ratios."""

    def __init__(self, do_quantile=False):
        super().__init__()
        # thresholds (in pixels) for the bad-pixel ratios
        self.bad_ths = [0.5, 1, 2, 3]
        self.do_quantile = do_quantile

    def forward(self, predictions, gt):
        """Compute metrics over a batch; pixels with non-finite gt are ignored."""
        batch = predictions.size(0)
        valid = torch.isfinite(gt)
        # Replace invalid (infinite) gt by a large finite value so the masked
        # products below stay finite (inf * 0 would produce nan).
        gt_fixed = gt.clone()
        gt_fixed[~valid] = 999999.0
        npx = valid.view(batch, -1).sum(dim=1)
        abs_err = (torch.abs(gt_fixed - predictions) * valid).view(batch, -1)
        sq_err = (torch.square(gt_fixed - predictions) * valid).view(batch, -1)
        metrics = {
            'avgerr': torch.mean(abs_err.sum(dim=1) / npx),
            'rmse': torch.sqrt(sq_err.sum(dim=1) / npx).mean(dim=0),
        }
        # fraction of valid pixels with error above each threshold, in percent
        for th in self.bad_ths:
            bad = ((abs_err > th) * valid.view(batch, -1)).sum(dim=1) / npx
            metrics['bad@{:.1f}'.format(th)] = bad.mean(dim=0) * 100
        return metrics
+
class FlowMetrics(nn.Module):
    """Batch-level optical-flow metrics: L1 error, EPE and bad-pixel ratios."""

    def __init__(self):
        super().__init__()
        self.bad_ths = [1, 3, 5]  # EPE thresholds (in pixels)

    def forward(self, predictions, gt):
        """Compute metrics over a batch; pixels with non-finite gt are ignored."""
        batch = predictions.size(0)
        # invalid pixels have infinite gt in both channels, so checking x suffices
        valid = torch.isfinite(gt[:, 0, :, :])
        npx = valid.view(batch, -1).sum(dim=1)
        # put a large finite placeholder at invalid pixels; their error is masked out
        gt_fixed = gt.clone()
        gt_fixed[:, 0, :, :][~valid] = 999999.0
        gt_fixed[:, 1, :, :][~valid] = 999999.0
        diff = gt_fixed - predictions
        l1 = (diff.abs().sum(dim=1) * valid).view(batch, -1)
        epe = (diff.square().sum(dim=1).sqrt() * valid).view(batch, -1)
        metrics = {
            'L1err': torch.mean(l1.sum(dim=1) / npx),
            'EPE': torch.mean(epe.sum(dim=1) / npx),
        }
        # fraction of valid pixels with EPE above each threshold, in percent
        for th in self.bad_ths:
            bad = ((epe > th) * valid.view(batch, -1)).sum(dim=1) / npx
            metrics['bad@{:.1f}'.format(th)] = bad.mean(dim=0) * 100
        return metrics
+
+############## metrics per dataset
+## we update the average and maintain the number of pixels while adding data batch per batch
+## at the beggining, call reset()
+## after each batch, call add_batch(...)
+## at the end: call get_results()
+
class StereoDatasetMetrics(nn.Module):
    """Dataset-level stereo metrics, aggregated incrementally batch by batch.

    Usage: call reset() once, then add_batch(...) after each batch, and
    finally get_results() to obtain the aggregated metrics dict.
    """

    def __init__(self):
        super().__init__()
        self.bad_ths = [0.5, 1, 2, 3]  # bad-pixel thresholds (in pixels)

    def reset(self):
        self.agg_N = 0                              # number of valid pixels so far
        self.agg_L1err = torch.tensor(0.0)          # running mean L1 error
        self.agg_Nbad = [0 for _ in self.bad_ths]   # bad-pixel counters
        self._metrics = None                        # cached results

    def add_batch(self, predictions, gt):
        """Fold one batch into the running statistics."""
        assert predictions.size(1) == 1, predictions.size()
        assert gt.size(1) == 1, gt.size()
        if gt.size(2) == predictions.size(2) * 2 and gt.size(3) == predictions.size(3) * 2:
            # Spring stores gt at twice the resolution: score each prediction
            # pixel against its best-matching of the 4 gt sub-pixels.
            sub_errs = [torch.sum(torch.abs(gt[:, :, dy::2, dx::2] - predictions), dim=1)
                        for dy in (0, 1) for dx in (0, 1)]
            l1 = sub_errs[0]
            for err in sub_errs[1:]:
                l1 = torch.minimum(l1, err)
            valid = torch.isfinite(l1)
        else:
            valid = torch.isfinite(gt[:, 0, :, :])
            l1 = torch.sum(torch.abs(gt - predictions), dim=1)
        n = valid.sum()
        total = self.agg_N + n
        # incremental update of the running mean L1 error
        self.agg_L1err = float(self.agg_N) / total * self.agg_L1err \
                         + l1[valid].mean().cpu() * float(n) / total
        self.agg_N = total
        for i, th in enumerate(self.bad_ths):
            self.agg_Nbad[i] += (l1[valid] > th).sum().cpu()

    def _compute_metrics(self):
        # compute once and cache in self._metrics
        if self._metrics is not None:
            return
        out = {'L1err': self.agg_L1err.item()}
        for i, th in enumerate(self.bad_ths):
            out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0
        self._metrics = out

    def get_results(self):
        self._compute_metrics()  # avoids recomputing on repeated calls
        return self._metrics
+
+class FlowDatasetMetrics(nn.Module):
+
    def __init__(self):
        """Dataset-level optical-flow metrics, aggregated batch by batch."""
        super().__init__()
        self.bad_ths = [0.5,1,3,5]  # EPE thresholds for the bad-pixel ratios
        self.speed_ths = [(0,10),(10,40),(40,torch.inf)]  # gt-speed bins for per-speed EPE
+
    def reset(self):
        """Clear all running statistics; call once before the first add_batch()."""
        self.agg_N = 0 # number of pixels so far
        self.agg_L1err = torch.tensor(0.0) # L1 error so far
        self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far
        self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels
        self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far
        self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far
        self._metrics = None
        self.pairname_results = {}
+
+ def add_batch(self, predictions, gt):
+ assert predictions.size(1)==2, predictions.size()
+ assert gt.size(1)==2, gt.size()
+ if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ...
+ L1err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1),
+ torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1))
+ L2err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1)))
+ valid = torch.isfinite(L1err)
+ gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them
+ else:
+ valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite
+ L1err = torch.sum(torch.abs(gt-predictions),dim=1)
+ L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1))
+ gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1))
+ N = valid.sum()
+ Nnew = self.agg_N + N
+ self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_N = Nnew
+ for i,th in enumerate(self.bad_ths):
+ self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu()
+ for i,(th1,th2) in enumerate(self.speed_ths):
+ vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time)
+ self._prepare_data()
+ self._load_or_build_cache()
+
    def prepare_data(self):
        """Dataset-specific initialization hook; to be defined for each dataset.

        NOTE(review): the subclasses visible in this file define `_prepare_data`
        (which __init__ calls), so this un-prefixed stub appears never to be
        overridden — confirm whether it should be named `_prepare_data`.
        """
        raise NotImplementedError
+
    def __len__(self):
        """Number of image pairs in this split."""
        return len(self.pairnames) # each pairname is typically of the form (str, int1, int2)
+
    def __getitem__(self, index):
        """Return (img1, img2, flow, pairname) for the pair at `index`.

        When the split has no ground truth (e.g. test), `flow` is an empty
        tensor; `pairname` is stringified so the default collate can batch it.
        """
        pairname = self.pairnames[index]

        # get filenames
        img1name = self.pairname_to_img1name(pairname)
        img2name = self.pairname_to_img2name(pairname)
        flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None

        # load images and flow ground truth (if any)
        img1 = _read_img(img1name)
        img2 = _read_img(img2name)
        flow = self.load_flow(flowname) if flowname is not None else None

        # apply augmentations
        if self.augmentor is not None:
            img1, img2, flow = self.augmentor(img1, img2, flow, self.name)

        if self.totensor:
            img1 = img_to_tensor(img1)
            img2 = img_to_tensor(img2)
            if flow is not None:
                flow = flow_to_tensor(flow)
            else:
                flow = torch.tensor([]) # to allow dataloader batching with default collate_fn
            pairname = str(pairname) # transform potential tuple to str to be able to batch it

        return img1, img2, flow, pairname
+
    def __rmul__(self, v):
        """Repeat the pair list `v` times (epoch-size multiplier, e.g. `3 * dataset`).

        NOTE(review): mutates the dataset in place and returns it rather than
        returning a new object; `self.rmul` records the accumulated factor for
        __repr__.
        """
        self.rmul *= v
        self.pairnames = v * self.pairnames
        return self
+
    def __str__(self):
        """Short identifier: '<ClassName>_<split>'."""
        return f'{self.__class__.__name__}_{self.split}'
+
    def __repr__(self):
        """Detailed description including split, augmentor, crop size and pair count."""
        s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})'
        if self.rmul==1:
            s+=f'\n\tnum pairs: {len(self.pairnames)}'
        else:
            # show both the repeated count and the underlying count x multiplier
            s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})'
        return s
+
    def _set_root(self):
        """Resolve this dataset's root directory from the module-level `dataset_to_root` map."""
        self.root = dataset_to_root[self.name]
        assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}"
+
    def _load_or_build_cache(self):
        """Load the pairname list for this split from the pickle cache, building it on first use.

        The cache stores the full split dict returned by `_build_cache`; only
        `self.split` is kept in memory. The pickle file is local data produced
        by this very code, so unpickling it is trusted.
        """
        cache_file = osp.join(cache_dir, self.name+'.pkl')
        if osp.isfile(cache_file):
            with open(cache_file, 'rb') as fid:
                self.pairnames = pickle.load(fid)[self.split]
        else:
            tosave = self._build_cache()
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_file, 'wb') as fid:
                pickle.dump(tosave, fid)
            self.pairnames = tosave[self.split]
+
class TartanAirDataset(FlowDataset):
    """TartanAir optical-flow pairs (train split only): consecutive left-camera frames."""

    def _prepare_data(self):
        self.name = "TartanAir"
        self._set_root()
        assert self.split in ['train']
        # pairname is (sequence_subdir, frame_i, frame_j)
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], f'image_left/{pairname[1]:06d}_left.png')
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], f'image_left/{pairname[2]:06d}_left.png')
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], f'flow/{pairname[1]:06d}_{pairname[2]:06d}_flow.npy')
        self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], f'{pairname[1]:06d}_{pairname[2]:06d}')
        self.load_flow = _read_numpy_flow

    def _build_cache(self):
        # Enumerate consecutive-frame pairs for every sequence / difficulty / trajectory.
        pairs = []
        for seq in sorted(os.listdir(self.root)):
            for difficulty in ['Easy', 'Hard']:
                diff_root = osp.join(self.root, seq, seq, difficulty)
                for traj in sorted(os.listdir(diff_root)):
                    frames = sorted(os.listdir(osp.join(diff_root, traj, 'image_left/')))
                    for fname in frames[:-1]:  # last frame has no successor
                        idx = int(fname[:6])
                        pairs.append((osp.join(seq, seq, difficulty, traj), idx, idx + 1))
        assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir"
        return {'train': pairs}
+
class FlyingChairsDataset(FlowDataset):
    """FlyingChairs optical-flow dataset; train/val split read from chairs_split.txt."""

    def _prepare_data(self):
        self.name = "FlyingChairs"
        self._set_root()
        assert self.split in ['train','val']
        # pairname is the 5-digit sample id as a string
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, 'data', pairname+'_img1.ppm')
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm')
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo')
        self.pairname_to_str = lambda pairname: pairname
        self.load_flow = _read_flo_file

    def _build_cache(self):
        """Parse chairs_split.txt (1 = train, 2 = val) into lists of 5-digit pair ids."""
        split_file = osp.join(self.root, 'chairs_split.txt')
        split_list = np.loadtxt(split_file, dtype=np.int32)
        # sample ids are 1-based, hence the +1
        trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1]
        valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1]
        # BUGFIX: the message previously said "MPI-Sintel" (copy-paste from another dataset)
        assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in FlyingChairs"
        tosave = {'train': trainpairs, 'val': valpairs}
        return tosave
+
class FlyingThingsDataset(FlowDataset):
    """FlyingThings3D optical-flow pairs.

    Splits combine {train, test, test1024} x {clean, final, all}pass x an
    optional '_rightcam' suffix; test1024 matches unimatch's validation split.
    """

    def _prepare_data(self):
        self.name = "FlyingThings"
        self._set_root()
        assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']]
        # pairname is (flow_subdir, frame_i, frame_j, pass_name)
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1]))
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2]))
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
        self.pairname_to_str = lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
        self.load_flow = _read_pfm_flow

    def _build_cache(self):
        tosave = {}
        # train and test splits for the different passes
        for set_ in ['train', 'test']:
            sroot = osp.join(self.root, 'optical_flow', set_.upper())
            fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')])
            # forward (into_future) frame pairs; the reversed (into_past) pairs are added below
            pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]]
            pairs = [(a,i,i+1) for a,i in pp]
            pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp]
            assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things"
            for cam in ['left','right']:
                camstr = '' if cam=='left' else f'_{cam}cam'
                for pass_ in ['final', 'clean']:
                    tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs]
                tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}']
        # test1024: this is the same split as unimatch 'validation' split
        # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229
        test1024_nsamples = 1024
        alltest_nsamples = len(tosave['test_cleanpass']) # 7866
        stride = alltest_nsamples // test1024_nsamples
        remove = alltest_nsamples % test1024_nsamples
        for cam in ['left','right']:
            camstr = '' if cam=='left' else f'_{cam}cam'
            for pass_ in ['final','clean']:
                tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before
            assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things"
            tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}']
        return tosave
+
+
+class MPISintelDataset(FlowDataset):
+
    def _prepare_data(self):
        """Set up MPI-Sintel filename mappings; pairname is (sequence_subdir, frame_index)."""
        self.name = "MPISintel"
        self._set_root()
        assert self.split in [s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']]
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]))
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1))
        # the test split ships no ground-truth flow
        self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1]))
        self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1]))
        self.load_flow = _read_flo_file
+
    def _build_cache(self):
        """Build train/test/subval/subtrain pair lists for clean/final/all passes.

        subval holds out the temple_2 and temple_3 sequences from train;
        subtrain is the remainder.
        """
        trainseqs = sorted(os.listdir(self.root+'training/clean'))
        trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))]
        subvalseqs = ['temple_2','temple_3']
        subtrainseqs = [s for s in trainseqs if s not in subvalseqs]
        subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)]
        subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)]
        testseqs = sorted(os.listdir(self.root+'test/clean'))
        testpairs = [ (osp.join('test/clean', s),i) for s in testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))]
        assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel"
        tosave = {}
        tosave['train_cleanpass'] = trainpairs
        tosave['test_cleanpass'] = testpairs
        tosave['subval_cleanpass'] = subvalpairs
        tosave['subtrain_cleanpass'] = subtrainpairs
        # finalpass reuses the same frame indices with the final-render images
        for t in ['train','test','subval','subtrain']:
            tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']]
            tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass']
        return tosave
+
    def submission_save_pairname(self, pairname, prediction, outdir, _time):
        """Write one predicted flow map (HxWx2) as a .flo file under <outdir>/submission/.

        `_time` is accepted for interface compatibility but unused here.
        """
        assert prediction.shape[2]==2
        outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo')
        os.makedirs( os.path.dirname(outfile), exist_ok=True)
        writeFlowFile(prediction, outfile)
+
+ def finalize_submission(self, outdir):
+ assert self.split == 'test_allpass'
+ bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg