diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..988b6ed47aa64018e3c5c85ac99120fca2830024 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,38 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/fenceflower/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/mattcecsit/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
+output/llff(sanerf-hq)/mattwrite/point_cloud_projection.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..194e236cbd708160926c3513b4232285eb47b029
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+data/
+checkpoints/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..c950ef981a8d2e47599dd7acbbe1bf8de9a42aca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "croco"]
+ path = croco
+ url = https://github.com/naver/croco
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1c2fda565b94d0f2b94cb65ba7cca866e7a25478
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/dust3r.iml b/.idea/dust3r.iml
new file mode 100644
index 0000000000000000000000000000000000000000..bb34444daa649d3f067846b9968327bc1a7bbc92
--- /dev/null
+++ b/.idea/dust3r.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..435bbe22b00f9b9a1482166532075912909946a0
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e06983fc6f59398257503edc06ae534d7c029189
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/other.xml b/.idea/other.xml
new file mode 100644
index 0000000000000000000000000000000000000000..4c89e05cf52029dbb6c8a4bed1cf2c782727520f
--- /dev/null
+++ b/.idea/other.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000000000000000000000000000000000000..572b3e4829fcd634d2d1edd8e3a54a57390d26b5
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,653 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {
+ "associatedIndex": 6
+}
+
+
+
+
+
+ {
+ "keyToString": {
+ "Python.base_opt.executor": "Debug",
+ "Python.demo.executor": "Debug",
+ "Python.evaluate.executor": "Run",
+ "Python.gys_util.executor": "Run",
+ "Python.load_nvos.executor": "Debug",
+ "Python.prepare_prompts.executor": "Debug",
+ "Python.segment_eval_mask.executor": "Run",
+ "Python.test_vis.executor": "Run",
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "last_opened_file_path": "D:/XMU/mac/hujie/3D/DUSt3R/dust3r/data/nerf_llff_data(NVOS-all)/orchids",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "nodejs_package_manager_path": "npm",
+ "settings.editor.selected.configurable": "editor.preferences.fonts.default",
+ "vue.rearranger.settings.migration": "true"
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1713236486096
+
+
+ 1713236486096
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ file://$PROJECT_DIR$/demo.py
+ 352
+
+
+
+ file://$PROJECT_DIR$/demo.py
+ 350
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 49
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 347
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 39
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 307
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 301
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 64
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 107
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 106
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 204
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 75
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 179
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 166
+
+
+
+ file://$PROJECT_DIR$/croco/models/dpt_block.py
+ 444
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 71
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 184
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 58
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 163
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 185
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 184
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 90
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 94
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 56
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 89
+
+
+
+ file://$PROJECT_DIR$/evaluate.py
+ 95
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 533
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 171
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 172
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 167
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 166
+
+
+
+ file://$PROJECT_DIR$/load_nvos.py
+ 170
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 37
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 128
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 131
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 146
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 136
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 140
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 143
+
+
+
+ file://$PROJECT_DIR$/dust3r/post_process.py
+ 16
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 291
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/init_im_poses.py
+ 292
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 370
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 270
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 269
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 179
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 195
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 176
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 197
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/optimizer.py
+ 187
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 30
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 377
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 140
+
+
+
+ file://$PROJECT_DIR$/dust3r/cloud_opt/base_opt.py
+ 139
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 167
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 173
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 171
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 162
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 162
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 153
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 237
+
+
+
+ file://$PROJECT_DIR$/SAM/predictor.py
+ 239
+
+
+
+ file://$PROJECT_DIR$/gys_util.py
+ 109
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 20
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 21
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 94
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 337
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 350
+
+
+
+ file://$PROJECT_DIR$/segment_eval_mask.py
+ 505
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..31d92d26f1b665d0f06b23378ef1e1d558b648d7
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,13 @@
+DUSt3R
+Copyright 2024-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+naver/croco
+https://github.com/naver/croco/
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0
+
diff --git a/README.md b/README.md
index e57733f951abe6839b51d58ae91d8149a9c3c4dc..c8ba786340b23cb30849ef2ae0f130cfa56e6103 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
----
-title: Our3D
-emoji: 🏆
-colorFrom: blue
-colorTo: green
-sdk: gradio
-sdk_version: 4.42.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: 3D
+emoji: 🐨
+colorFrom: yellow
+colorTo: green
+sdk: gradio
+sdk_version: 4.42.0
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/SAM/__init__.py b/SAM/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65fe71691ee281310fb821441bc6d14285044322
--- /dev/null
+++ b/SAM/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .build_sam import (
+ build_sam,
+ build_sam_vit_h,
+ build_sam_vit_l,
+ build_sam_vit_b,
+ sam_model_registry,
+)
+from .predictor import SamPredictor
+from .automatic_mask_generator import SamAutomaticMaskGenerator
\ No newline at end of file
diff --git a/SAM/__pycache__/__init__.cpython-310.pyc b/SAM/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4232ca38eca64052b110ac7db53b31a96decb924
Binary files /dev/null and b/SAM/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc b/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee29a97176397f9a9a7eeaf2663f61cdd7c10909
Binary files /dev/null and b/SAM/__pycache__/automatic_mask_generator.cpython-310.pyc differ
diff --git a/SAM/__pycache__/build_sam.cpython-310.pyc b/SAM/__pycache__/build_sam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fd9f031bf3367e8698126d84f6de4b0a1b565ee
Binary files /dev/null and b/SAM/__pycache__/build_sam.cpython-310.pyc differ
diff --git a/SAM/__pycache__/predictor.cpython-310.pyc b/SAM/__pycache__/predictor.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..334ea40f774d11ad495980f8f6081b3b9847c0a1
Binary files /dev/null and b/SAM/__pycache__/predictor.cpython-310.pyc differ
diff --git a/SAM/automatic_mask_generator.py b/SAM/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..da2f60cef62b7da80b02a835aca21eff328e65aa
--- /dev/null
+++ b/SAM/automatic_mask_generator.py
@@ -0,0 +1,372 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from .modeling import Sam
+from .predictor import SamPredictor
+from .utils.amg import (
+ MaskData,
+ area_from_rle,
+ batch_iterator,
+ batched_mask_to_box,
+ box_xyxy_to_xywh,
+ build_all_layer_point_grids,
+ calculate_stability_score,
+ coco_encode_rle,
+ generate_crop_boxes,
+ is_box_near_crop_edge,
+ mask_to_rle_pytorch,
+ remove_small_regions,
+ rle_to_mask,
+ uncrop_boxes_xyxy,
+ uncrop_masks,
+ uncrop_points,
+)
+
+
+class SamAutomaticMaskGenerator:
+ def __init__(
+ self,
+ model: Sam,
+ points_per_side: Optional[int] = 32,
+ points_per_batch: int = 64,
+ pred_iou_thresh: float = 0.88,
+ stability_score_thresh: float = 0.95,
+ stability_score_offset: float = 1.0,
+ box_nms_thresh: float = 0.7,
+ crop_n_layers: int = 0,
+ crop_nms_thresh: float = 0.7,
+ crop_overlap_ratio: float = 512 / 1500,
+ crop_n_points_downscale_factor: int = 1,
+ point_grids: Optional[List[np.ndarray]] = None,
+ min_mask_region_area: int = 0,
+ output_mode: str = "binary_mask",
+ ) -> None:
+ """
+ Using a SAM model, generates masks for the entire image.
+ Generates a grid of point prompts over the image, then filters
+ low quality and duplicate masks. The default settings are chosen
+ for SAM with a ViT-H backbone.
+
+ Arguments:
+ model (Sam): The SAM model to use for mask prediction.
+ points_per_side (int or None): The number of points to be sampled
+ along one side of the image. The total number of points is
+ points_per_side**2. If None, 'point_grids' must provide explicit
+ point sampling.
+ points_per_batch (int): Sets the number of points run simultaneously
+ by the model. Higher numbers may be faster but use more GPU memory.
+ pred_iou_thresh (float): A filtering threshold in [0,1], using the
+ model's predicted mask quality.
+ stability_score_thresh (float): A filtering threshold in [0,1], using
+ the stability of the mask under changes to the cutoff used to binarize
+ the model's mask predictions.
+ stability_score_offset (float): The amount to shift the cutoff when
+ calculated the stability score.
+ box_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks.
+ crop_n_layers (int): If >0, mask prediction will be run again on
+ crops of the image. Sets the number of layers to run, where each
+ layer has 2**i_layer number of image crops.
+ crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks between different crops.
+ crop_overlap_ratio (float): Sets the degree to which crops overlap.
+ In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (int): The number of points-per-side
+ sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ point_grids (list(np.ndarray) or None): A list over explicit grids
+ of points used for sampling, normalized to [0,1]. The nth grid in the
+ list is used in the nth crop layer. Exclusive with points_per_side.
+ min_mask_region_area (int): If >0, postprocessing will be applied
+ to remove disconnected regions and holes in masks with area smaller
+ than min_mask_region_area. Requires opencv.
+ output_mode (str): The form masks are returned in. Can be 'binary_mask',
+ 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+ For large resolutions, 'binary_mask' may consume large amounts of
+ memory.
+ """
+
+ assert (points_per_side is None) != (
+ point_grids is None
+ ), "Exactly one of points_per_side or point_grid must be provided."
+ if points_per_side is not None:
+ self.point_grids = build_all_layer_point_grids(
+ points_per_side,
+ crop_n_layers,
+ crop_n_points_downscale_factor,
+ )
+ elif point_grids is not None:
+ self.point_grids = point_grids
+ else:
+ raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+ assert output_mode in [
+ "binary_mask",
+ "uncompressed_rle",
+ "coco_rle",
+ ], f"Unknown output_mode {output_mode}."
+ if output_mode == "coco_rle":
+ from pycocotools import mask as mask_utils # type: ignore # noqa: F401
+
+ if min_mask_region_area > 0:
+ import cv2 # type: ignore # noqa: F401
+
+ self.predictor = SamPredictor(model)
+ self.points_per_batch = points_per_batch
+ self.pred_iou_thresh = pred_iou_thresh
+ self.stability_score_thresh = stability_score_thresh
+ self.stability_score_offset = stability_score_offset
+ self.box_nms_thresh = box_nms_thresh
+ self.crop_n_layers = crop_n_layers
+ self.crop_nms_thresh = crop_nms_thresh
+ self.crop_overlap_ratio = crop_overlap_ratio
+ self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+ self.min_mask_region_area = min_mask_region_area
+ self.output_mode = output_mode
+
+ @torch.no_grad()
+ def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+ """
+ Generates masks for the given image.
+
+ Arguments:
+ image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+ Returns:
+ list(dict(str, any)): A list over records for masks. Each record is
+ a dict containing the following keys:
+ segmentation (dict(str, any) or np.ndarray): The mask. If
+ output_mode='binary_mask', is an array of shape HW. Otherwise,
+ is a dictionary containing the RLE.
+ bbox (list(float)): The box around the mask, in XYWH format.
+ area (int): The area in pixels of the mask.
+ predicted_iou (float): The model's own prediction of the mask's
+ quality. This is filtered by the pred_iou_thresh parameter.
+ point_coords (list(list(float))): The point coordinates input
+ to the model to generate this mask.
+ stability_score (float): A measure of the mask's quality. This
+ is filtered on using the stability_score_thresh parameter.
+ crop_box (list(float)): The crop of the image used to generate
+ the mask, given in XYWH format.
+ """
+
+ # Generate masks
+ mask_data = self._generate_masks(image)
+
+ # Filter small disconnected regions and holes in masks
+ if self.min_mask_region_area > 0:
+ mask_data = self.postprocess_small_regions(
+ mask_data,
+ self.min_mask_region_area,
+ max(self.box_nms_thresh, self.crop_nms_thresh),
+ )
+
+ # Encode masks
+ if self.output_mode == "coco_rle":
+ mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
+ elif self.output_mode == "binary_mask":
+ mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+ else:
+ mask_data["segmentations"] = mask_data["rles"]
+
+ # Write mask records
+ curr_anns = []
+ for idx in range(len(mask_data["segmentations"])):
+ ann = {
+ "segmentation": mask_data["segmentations"][idx],
+ "area": area_from_rle(mask_data["rles"][idx]),
+ "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+ "predicted_iou": mask_data["iou_preds"][idx].item(),
+ "point_coords": [mask_data["points"][idx].tolist()],
+ "stability_score": mask_data["stability_score"][idx].item(),
+ "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+ }
+ curr_anns.append(ann)
+
+ return curr_anns
+
+ def _generate_masks(self, image: np.ndarray) -> MaskData:
+ orig_size = image.shape[:2]
+ crop_boxes, layer_idxs = generate_crop_boxes(
+ orig_size, self.crop_n_layers, self.crop_overlap_ratio
+ )
+
+ # Iterate over image crops
+ data = MaskData()
+ for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+ crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+ data.cat(crop_data)
+
+ # Remove duplicate masks between crops
+ if len(crop_boxes) > 1:
+ # Prefer masks from smaller crops
+ scores = 1 / box_area(data["crop_boxes"])
+ scores = scores.to(data["boxes"].device)
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ scores,
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.crop_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ data.to_numpy()
+ return data
+
+ def _process_crop(
+ self,
+ image: np.ndarray,
+ crop_box: List[int],
+ crop_layer_idx: int,
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ # Crop the image and calculate embeddings
+ x0, y0, x1, y1 = crop_box
+ cropped_im = image[y0:y1, x0:x1, :]
+ cropped_im_size = cropped_im.shape[:2]
+ self.predictor.set_image(cropped_im)
+
+ # Get points for this crop
+ points_scale = np.array(cropped_im_size)[None, ::-1]
+ points_for_image = self.point_grids[crop_layer_idx] * points_scale
+
+ # Generate masks for this crop in batches
+ data = MaskData()
+ for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+ batch_data = self._process_batch(points, cropped_im_size, crop_box, orig_size)
+ data.cat(batch_data)
+ del batch_data
+ self.predictor.reset_image()
+
+ # Remove duplicates within this crop.
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ data["iou_preds"],
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.box_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ # Return to the original image frame
+ data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+ data["points"] = uncrop_points(data["points"], crop_box)
+ data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+ return data
+
+ def _process_batch(
+ self,
+ points: np.ndarray,
+ im_size: Tuple[int, ...],
+ crop_box: List[int],
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ orig_h, orig_w = orig_size
+
+ # Run model on this batch
+ transformed_points = self.predictor.transform.apply_coords(points, im_size)
+ in_points = torch.as_tensor(transformed_points, device=self.predictor.device)
+ in_labels = torch.ones(in_points.shape[0], dtype=torch.int, device=in_points.device)
+ masks, iou_preds, _ = self.predictor.predict_torch(
+ in_points[:, None, :],
+ in_labels[:, None],
+ multimask_output=True,
+ return_logits=True,
+ )
+
+ # Serialize predictions and store in MaskData
+ data = MaskData(
+ masks=masks.flatten(0, 1),
+ iou_preds=iou_preds.flatten(0, 1),
+ points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+ )
+ del masks
+
+ # Filter by predicted IoU
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ # Calculate stability score
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.predictor.model.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+
+ # Threshold masks and calculate boxes
+ data["masks"] = data["masks"] > self.predictor.model.mask_threshold
+ data["boxes"] = batched_mask_to_box(data["masks"])
+
+ # Filter boxes that touch crop boundaries
+ keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+ if not torch.all(keep_mask):
+ data.filter(keep_mask)
+
+ # Compress to RLE
+ data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+ data["rles"] = mask_to_rle_pytorch(data["masks"])
+ del data["masks"]
+
+ return data
+
+ @staticmethod
+ def postprocess_small_regions(
+ mask_data: MaskData, min_area: int, nms_thresh: float
+ ) -> MaskData:
+ """
+ Removes small disconnected regions and holes in masks, then reruns
+ box NMS to remove any new duplicates.
+
+ Edits mask_data in place.
+
+ Requires open-cv as a dependency.
+ """
+ if len(mask_data["rles"]) == 0:
+ return mask_data
+
+ # Filter small disconnected regions and holes
+ new_masks = []
+ scores = []
+ for rle in mask_data["rles"]:
+ mask = rle_to_mask(rle)
+
+ mask, changed = remove_small_regions(mask, min_area, mode="holes")
+ unchanged = not changed
+ mask, changed = remove_small_regions(mask, min_area, mode="islands")
+ unchanged = unchanged and not changed
+
+ new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+ # Give score=0 to changed masks and score=1 to unchanged masks
+ # so NMS will prefer ones that didn't need postprocessing
+ scores.append(float(unchanged))
+
+ # Recalculate boxes and remove any new duplicates
+ masks = torch.cat(new_masks, dim=0)
+ boxes = batched_mask_to_box(masks)
+ keep_by_nms = batched_nms(
+ boxes.float(),
+ torch.as_tensor(scores),
+ torch.zeros_like(boxes[:, 0]), # categories
+ iou_threshold=nms_thresh,
+ )
+
+ # Only recalculate RLEs for masks that have changed
+ for i_mask in keep_by_nms:
+ if scores[i_mask] == 0.0:
+ mask_torch = masks[i_mask].unsqueeze(0)
+ mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+ mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
+ mask_data.filter(keep_by_nms)
+
+ return mask_data
\ No newline at end of file
diff --git a/SAM/build_sam.py b/SAM/build_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf07ae6373f722ad7f78ce515338b03d422a72a7
--- /dev/null
+++ b/SAM/build_sam.py
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from functools import partial
+
+from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
+
+
+def build_sam_vit_h(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1280,
+ encoder_depth=32,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[7, 15, 23, 31],
+ checkpoint=checkpoint,
+ )
+
+
+build_sam = build_sam_vit_h
+
+
+def build_sam_vit_l(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1024,
+ encoder_depth=24,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[5, 11, 17, 23],
+ checkpoint=checkpoint,
+ )
+
+
+def build_sam_vit_b(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=768,
+ encoder_depth=12,
+ encoder_num_heads=12,
+ encoder_global_attn_indexes=[2, 5, 8, 11],
+ checkpoint=checkpoint,
+ )
+
+
+sam_model_registry = {
+ "default": build_sam_vit_h,
+ "vit_h": build_sam_vit_h,
+ "vit_l": build_sam_vit_l,
+ "vit_b": build_sam_vit_b,
+}
+
+
+def _build_sam(
+ encoder_embed_dim,
+ encoder_depth,
+ encoder_num_heads,
+ encoder_global_attn_indexes,
+ checkpoint=None,
+):
+ prompt_embed_dim = 256
+ image_size = 1024
+ vit_patch_size = 16
+ image_embedding_size = image_size // vit_patch_size
+ sam = Sam(
+ image_encoder=ImageEncoderViT(
+ depth=encoder_depth,
+ embed_dim=encoder_embed_dim,
+ img_size=image_size,
+ mlp_ratio=4,
+ norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+ num_heads=encoder_num_heads,
+ patch_size=vit_patch_size,
+ qkv_bias=True,
+ use_rel_pos=True,
+ global_attn_indexes=encoder_global_attn_indexes,
+ window_size=14,
+ out_chans=prompt_embed_dim,
+ ),
+ prompt_encoder=PromptEncoder(
+ embed_dim=prompt_embed_dim,
+ image_embedding_size=(image_embedding_size, image_embedding_size),
+ input_image_size=(image_size, image_size),
+ mask_in_chans=16,
+ ),
+ mask_decoder=MaskDecoder(
+ num_multimask_outputs=3,
+ transformer=TwoWayTransformer(
+ depth=2,
+ embedding_dim=prompt_embed_dim,
+ mlp_dim=2048,
+ num_heads=8,
+ ),
+ transformer_dim=prompt_embed_dim,
+ iou_head_depth=3,
+ iou_head_hidden_dim=256,
+ ),
+ pixel_mean=[123.675, 116.28, 103.53],
+ pixel_std=[58.395, 57.12, 57.375],
+ )
+ sam.eval()
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ sam.load_state_dict(state_dict)
+ return sam
\ No newline at end of file
diff --git a/SAM/modeling/__init__.py b/SAM/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..392f6cbc118573ee18f946312c06af34716f9836
--- /dev/null
+++ b/SAM/modeling/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .sam import Sam
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .transformer import TwoWayTransformer
+from .image_encoder import ImageEncoderViT
\ No newline at end of file
diff --git a/SAM/modeling/__pycache__/__init__.cpython-310.pyc b/SAM/modeling/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aac45024e3f3fc300f0bf3bf822a0057a1dc13d7
Binary files /dev/null and b/SAM/modeling/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/common.cpython-310.pyc b/SAM/modeling/__pycache__/common.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47f8c8efb755572200fa19fe93143fdd76488ef0
Binary files /dev/null and b/SAM/modeling/__pycache__/common.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc b/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aeaf50923294930fbe14020a4c326c4b0a7a30d2
Binary files /dev/null and b/SAM/modeling/__pycache__/image_encoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc b/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7074bf0ef5a1c3d623aa4c57d24aad3e87905ae7
Binary files /dev/null and b/SAM/modeling/__pycache__/mask_decoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc b/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13910b6023c9111c5427dcfbf16e6a2b1a3f7d22
Binary files /dev/null and b/SAM/modeling/__pycache__/prompt_encoder.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/sam.cpython-310.pyc b/SAM/modeling/__pycache__/sam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5045ad7c082a3c18a36d5e3674a97fde72da086
Binary files /dev/null and b/SAM/modeling/__pycache__/sam.cpython-310.pyc differ
diff --git a/SAM/modeling/__pycache__/transformer.cpython-310.pyc b/SAM/modeling/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b38537c70cf84caaf28ef49ef2b62317e154e6ec
Binary files /dev/null and b/SAM/modeling/__pycache__/transformer.cpython-310.pyc differ
diff --git a/SAM/modeling/common.py b/SAM/modeling/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..d67662c6a517be28bf3b8d037056a6e376cf7a7e
--- /dev/null
+++ b/SAM/modeling/common.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+from typing import Type
+
+
class MLPBlock(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> Linear."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        # Attribute names (lin1/lin2/act) are part of the checkpoint layout;
        # do not rename them.
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.lin1(x))
        return self.lin2(hidden)
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    """LayerNorm applied over the channel dimension of NCHW tensors."""

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize each spatial position across channels (dim=1),
        # then apply the learned per-channel affine transform.
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * normed + self.bias[:, None, None]
\ No newline at end of file
diff --git a/SAM/modeling/image_encoder.py b/SAM/modeling/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7030c033aa3147e9a30fb9a137cc7829e786993c
--- /dev/null
+++ b/SAM/modeling/image_encoder.py
@@ -0,0 +1,395 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Type
+
+from .common import LayerNorm2d, MLPBlock
+
+
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
    """ViT backbone mapping an image to a dense embedding, adapted from the
    ViTDet backbone in detectron2."""

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Absolute positional embedding sized for the pretraining grid.
            side = img_size // patch_size
            self.pos_embed = nn.Parameter(torch.zeros(1, side, side, embed_dim))

        grid = (img_size // patch_size, img_size // patch_size)
        # Blocks listed in global_attn_indexes attend globally (window_size=0);
        # all others use windowed attention.
        self.blocks = nn.ModuleList(
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=0 if idx in global_attn_indexes else window_size,
                input_size=grid,
            )
            for idx in range(depth)
        )

        # Project embed_dim -> out_chans with a 1x1 conv, then refine with a
        # 3x3 conv; both are norm'd and bias-free.
        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        tokens = self.patch_embed(x)
        if self.pos_embed is not None:
            tokens = tokens + self.pos_embed

        for block in self.blocks:
            tokens = block(tokens)

        # NHWC -> NCHW for the convolutional neck.
        return self.neck(tokens.permute(0, 3, 1, 2))
+
+
class Block(nn.Module):
    """Transformer block with residual connections, supporting windowed or
    global attention."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        # Windowed blocks size their relative-position tables to the window;
        # global blocks (window_size == 0) use the full input resolution.
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=(window_size, window_size) if window_size != 0 else input_size,
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)

        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.norm1(x)
        if self.window_size > 0:
            # Split into padded windows; remember the original H, W for the merge.
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = residual + x
        return x + self.mlp(self.norm2(x))
+
+
class Attention(nn.Module):
    """Multi-head self-attention with optional decomposed relative position
    embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # One table per axis, covering all 2*size-1 possible offsets.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, H, W, _ = x.shape
        tokens = H * W
        # (B, H, W, C) -> (3, B, nHeads, tokens, head_dim)
        qkv = self.qkv(x).reshape(B, tokens, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # Fold heads into the batch dimension: (B * nHeads, tokens, head_dim).
        q, k, v = qkv.reshape(3, B * self.num_heads, tokens, -1).unbind(0)

        attn = (q * self.scale) @ k.transpose(-2, -1)
        if self.use_rel_pos:
            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
        attn = attn.softmax(dim=-1)

        out = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        return self.proj(out)
+
+
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Split a [B, H, W, C] tensor into non-overlapping square windows, padding
    the bottom/right edges if H or W is not a multiple of window_size.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h or pad_w:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    grid = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)
+
+
def window_unpartition(
    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
    """
    Reassemble windows produced by window_partition into a [B, H, W, C]
    tensor, discarding any padding that was added.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    num_windows = (Hp // window_size) * (Wp // window_size)
    B = windows.shape[0] // num_windows
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        # Strip the bottom/right padding added by window_partition.
        x = x[:, :H, :W, :].contiguous()
    return x
+
+
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Look up relative positional embeddings for every (query, key) pair along
    one axis, resampling the table if it was trained at a different size.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] != max_rel_dist:
        # Linearly resample the table to the required number of offsets.
        resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Relative offset of each (query, key) pair, scaled by the shorter length
    # when q and k sizes differ, then shifted to be a non-negative index.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]
+
+
def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    # Per-axis embedding for every (query, key) offset: (q_h, k_h, C) / (q_w, k_w, C).
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    # Dot the queries with each axis table: (B, q_h, q_w, k_h) and (B, q_h, q_w, k_w).
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    # Broadcast-add the two axis terms over the 5-D (B, q_h, q_w, k_h, k_w)
    # view of attn, then flatten back to (B, q_h*q_w, k_h*k_w).
    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn
+
+
class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution."""

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embedded = self.proj(x)
        # Move channels last: (B, C, H, W) -> (B, H, W, C).
        return embedded.permute(0, 2, 3, 1)
\ No newline at end of file
diff --git a/SAM/modeling/mask_decoder.py b/SAM/modeling/mask_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6abb60143e4ea8e826a635f79b5d3a4df488add2
--- /dev/null
+++ b/SAM/modeling/mask_decoder.py
@@ -0,0 +1,192 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import List, Tuple, Type
+
+from .common import LayerNorm2d
+
+
class MaskDecoder(nn.Module):
    def __init__(
        self,
        *,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
    ) -> None:
        """
        Predicts masks given an image and prompt embeddings, using a
        transformer architecture.

        Arguments:
          transformer_dim (int): the channel dimension of the transformer
          transformer (nn.Module): the transformer used to predict masks
          num_multimask_outputs (int): the number of masks to predict
            when disambiguating masks
          activation (nn.Module): the type of activation to use when
            upscaling masks
          iou_head_depth (int): the depth of the MLP used to predict
            mask quality
          iou_head_hidden_dim (int): the hidden dimension of the MLP
            used to predict mask quality
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        # +1: one extra token for the single-mask (non-multimask) output slot.
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        # 4x spatial upscaling of the image embedding via two stride-2
        # transposed convolutions.
        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        # One hypernetwork MLP per mask token; each produces the weights used
        # to project the upscaled embedding into a mask.
        self.output_hypernetworks_mlps = nn.ModuleList(
            [
                MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
                for i in range(self.num_mask_tokens)
            ]
        )

        self.iou_prediction_head = MLP(
            transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        batch_ind_list: "Optional[List[int]]" = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Arguments:
          image_embeddings (torch.Tensor): the embeddings from the image encoder
          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
          multimask_output (bool): Whether to return multiple masks or a single
            mask.
          batch_ind_list (list(int) or None): when set, entry i gives the
            number of prompt instances belonging to image i, so each image
            embedding is repeated that many times; when None, every prompt
            shares the single image embedding.

        Returns:
          torch.Tensor: batched predicted masks
          torch.Tensor: batched predictions of mask quality
        """
        masks, iou_pred = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
            batch_ind_list=batch_ind_list,
        )

        # Select the correct mask or masks for output: index 0 is the
        # single-mask token; indices 1..N are the multimask tokens.
        if multimask_output:
            mask_slice = slice(1, None)
        else:
            mask_slice = slice(0, 1)
        masks = masks[:, mask_slice, :, :]
        iou_pred = iou_pred[:, mask_slice]

        # Prepare output
        return masks, iou_pred

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        batch_ind_list: "Optional[List[int]]",
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predicts masks. See 'forward' for more details."""
        # Concatenate output tokens
        if batch_ind_list is None:
            # Single shared image embedding, repeated once per prompt.
            output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
            output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

            # Expand per-image data in batch direction to be per-mask
            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
            src = src + dense_prompt_embeddings
            pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
            b, c, h, w = src.shape
        else:
            # Multi-image batch: image i's embedding is repeated
            # batch_ind_list[i] times so each instance gets its own copy.
            num_instances = int(sparse_prompt_embeddings.size(0))
            output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
            output_tokens = output_tokens.unsqueeze(0).expand(num_instances, -1, -1)
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

            # Expand per-image data in batch direction to be per-mask
            image_embeddings = torch.cat([image_embeddings[i].unsqueeze(0).repeat(n, 1, 1, 1) for i, n in enumerate(batch_ind_list)], dim=0)
            src = image_embeddings
            src = src + dense_prompt_embeddings
            # NOTE(review): assumes image_pe has batch size 1 here — confirm
            # against callers using batch_ind_list.
            pos_src = torch.repeat_interleave(image_pe, num_instances, dim=0)
            b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(src, pos_src, tokens)
        # Token 0 is the IoU token; tokens 1..num_mask_tokens are mask tokens.
        iou_token_out = hs[:, 0, :]
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in_list: List[torch.Tensor] = []
        for i in range(self.num_mask_tokens):
            hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]))
        hyper_in = torch.stack(hyper_in_list, dim=1)
        b, c, h, w = upscaled_embedding.shape
        # Each mask is the dot product of its hypernetwork output with the
        # upscaled embedding at every spatial location.
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)

        return masks, iou_pred
+
+
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
class MLP(nn.Module):
    """Simple multi-layer perceptron with ReLU between hidden layers."""

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        """
        Args:
            input_dim (int): size of the input features.
            hidden_dim (int): size of every hidden layer.
            output_dim (int): size of the output features.
            num_layers (int): total number of Linear layers.
            sigmoid_output (bool): if True, apply a sigmoid to the final output.
        """
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            # ReLU after every layer except the last.
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.sigmoid_output:
            # torch.sigmoid replaces the deprecated F.sigmoid.
            x = torch.sigmoid(x)
        return x
\ No newline at end of file
diff --git a/SAM/modeling/prompt_encoder.py b/SAM/modeling/prompt_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ea3a1d02e02232a7928ac235024d433a85be97
--- /dev/null
+++ b/SAM/modeling/prompt_encoder.py
@@ -0,0 +1,214 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch import nn
+
+from typing import Any, Optional, Tuple, Type
+
+from .common import LayerNorm2d
+
+
class PromptEncoder(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Encodes prompts for input to SAM's mask decoder.

        Arguments:
          embed_dim (int): The prompts' embedding dimension
          image_embedding_size (tuple(int, int)): The spatial size of the
            image embedding, as (H, W).
          input_image_size (int): The padded size of the image as input
            to the image encoder, as (H, W).
          mask_in_chans (int): The number of hidden channels used for
            encoding input masks.
          activation (nn.Module): The activation to use when encoding
            input masks.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        # Random-frequency positional encoding shared by point, box and
        # dense-grid encodings.
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
        # Indices: 0 = negative point, 1 = positive point,
        #          2 = first box corner, 3 = second box corner.
        point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)]
        self.point_embeddings = nn.ModuleList(point_embeddings)
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        # Input masks are expected at 4x the embedding resolution; the
        # downscaler reduces them back by 4x (two stride-2 convs).
        self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1])
        self.mask_downscaling = nn.Sequential(
            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans // 4),
            activation(),
            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        )
        # Learned embedding used when no mask prompt is supplied.
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Returns the positional encoding used to encode point prompts,
        applied to a dense set of points the shape of the image encoding.

        Returns:
          torch.Tensor: Positional encoding with shape
            1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        return self.pe_layer(self.image_embedding_size).unsqueeze(0)

    def _embed_points(
        self,
        points: torch.Tensor,
        labels: torch.Tensor,
        pad: bool,
    ) -> torch.Tensor:
        """Embeds point prompts."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            # When no box prompt accompanies the points, append a dummy
            # point with label -1 ("not a point").
            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, padding_point], dim=1)
            labels = torch.cat([labels, padding_label], dim=1)
        point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
        # Label -1: replace the positional encoding with the learned
        # "not a point" embedding; labels 0/1 add their type embedding.
        point_embedding[labels == -1] = 0.0
        point_embedding[labels == -1] += self.not_a_point_embed.weight
        point_embedding[labels == 0] += self.point_embeddings[0].weight
        point_embedding[labels == 1] += self.point_embeddings[1].weight
        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts."""
        boxes = boxes + 0.5  # Shift to center of pixel
        # Each box contributes two corner tokens with distinct embeddings.
        coords = boxes.reshape(-1, 2, 2)
        corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
        return corner_embedding

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embeds mask inputs."""
        mask_embedding = self.mask_downscaling(masks)
        return mask_embedding

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """
        Gets the batch size of the output given the batch size of the input prompts.
        """
        if points is not None:
            return points[0].shape[0]
        elif boxes is not None:
            return boxes.shape[0]
        elif masks is not None:
            return masks.shape[0]
        else:
            return 1

    def _get_device(self) -> torch.device:
        return self.point_embeddings[0].weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense
        embeddings.

        Arguments:
          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
            and labels to embed.
          boxes (torch.Tensor or none): boxes to embed
          masks (torch.Tensor or none): masks to embed

        Returns:
          torch.Tensor: sparse embeddings for the points and boxes, with shape
            BxNx(embed_dim), where N is determined by the number of input points
            and boxes.
          torch.Tensor: dense embeddings for the masks, in the shape
            Bx(embed_dim)x(embed_H)x(embed_W)
        """
        bs = self._get_batch_size(points, boxes, masks)
        # Start with an empty sparse embedding and concatenate whatever
        # prompt types were provided.
        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
        if points is not None:
            coords, labels = points
            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
        if boxes is not None:
            box_embeddings = self._embed_boxes(boxes)
            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)

        if masks is not None:
            dense_embeddings = self._embed_masks(masks)
        else:
            # No mask prompt: broadcast the learned no-mask embedding over
            # the full embedding grid.
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
            )

        return sparse_embeddings, dense_embeddings
+
+
class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        # Fixed random projection; registered as a buffer so it follows the
        # module across devices.
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points already normalized to [0, 1]."""
        # Map [0, 1] -> [-1, 1], project with the random matrix, then take
        # sin/cos features; output is d_1 x ... x d_n x C.
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for an h x w grid."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        # Normalized pixel-center coordinates in [0, 1].
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        encoded = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return encoded.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
\ No newline at end of file
diff --git a/SAM/modeling/sam.py b/SAM/modeling/sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b4655187c22d06d63f27139ae469f20d33b749
--- /dev/null
+++ b/SAM/modeling/sam.py
@@ -0,0 +1,187 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import Any, Dict, List, Tuple
+
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .image_encoder import ImageEncoderViT
+
class Sam(nn.Module):
    """
    SAM: predicts object masks from an image and input prompts by combining
    an image encoder, a prompt encoder, and a mask decoder.
    """

    # Logits above this threshold are treated as foreground when binarizing.
    mask_threshold: float = 0.0
    # Expected channel order of input images.
    image_format: str = "RGB"

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = [123.675, 116.28, 103.53],
        pixel_std: List[float] = [58.395, 57.12, 57.375],
    ) -> None:
        """
        SAM predicts object masks from an image and input prompts.

        Arguments:
          image_encoder (ImageEncoderViT): The backbone used to encode the
            image into image embeddings that allow for efficient mask prediction.
          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
            and encoded prompts.
          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
          pixel_std (list(float)): Std values for normalizing pixels in the input image.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        # Third positional arg persistent=False: the normalization constants
        # move with the module's device but are not saved in the state dict.
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self) -> Any:
        # Buffers follow .to()/.cuda(), so this tracks the module's device.
        return self.pixel_mean.device

    @torch.no_grad()
    def forward(
        self,
        batched_input: List[Dict[str, Any]],
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Predicts masks end-to-end from provided images and prompts.
        If prompts are not known in advance, using SamPredictor is
        recommended over calling the model directly.

        Arguments:
          batched_input (list(dict)): A list over input images, each a
            dictionary with the following keys. A prompt key can be
            excluded if it is not present.
              'image': The image as a torch tensor in 3xHxW format,
                already transformed for input to the model.
              'original_size': (tuple(int, int)) The original size of
                the image before transformation, as (H, W).
              'point_coords': (torch.Tensor) Batched point prompts for
                this image, with shape BxNx2. Already transformed to the
                input frame of the model.
              'point_labels': (torch.Tensor) Batched labels for point prompts,
                with shape BxN.
              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
                Already transformed to the input frame of the model.
              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
                in the form Bx1xHxW.
          multimask_output (bool): Whether the model should predict multiple
            disambiguating masks, or return a single mask.

        Returns:
          (list(dict)): A list over input images, where each element is
            a dictionary with the following keys.
              'masks': (torch.Tensor) Batched binary mask predictions,
                with shape BxCxHxW, where B is the number of input prompts,
                C is determined by multimask_output, and (H, W) is the
                original size of the image.
              'iou_predictions': (torch.Tensor) The model's predictions
                of mask quality, in shape BxC.
              'low_res_logits': (torch.Tensor) Low resolution logits with
                shape BxCxHxW, where H=W=256. Can be passed as mask input
                to subsequent iterations of prediction.
        """
        # Each image may carry a different number of prompts, so per-image
        # prompt embeddings are concatenated into one flat batch and
        # batch_ind_list records each image's prompt count for splitting later.
        spase_embed_list = []
        dense_embed_list = []
        batch_ind_list = []
        input_images_list = []
        for idx, image_record in enumerate(batched_input):
            input_images_list.append(self.preprocess(image_record["image"]))
            if "point_coords" in image_record:
                points = (image_record["point_coords"], image_record["point_labels"])
            else:
                points = None
            sparse_embed, dense_embed = self.prompt_encoder(
                points=points,
                boxes=image_record.get("boxes", None),
                masks=image_record.get("mask_inputs", None),
            )
            # Sparse and dense embeddings must agree on the prompt count.
            assert len(sparse_embed) == len(dense_embed)
            spase_embed_list.append(sparse_embed)
            dense_embed_list.append(dense_embed)
            batch_ind_list.append(len(sparse_embed))

        # Encode all images in a single batch, then decode every prompt at once.
        image_embeddings = self.image_encoder(torch.stack(input_images_list, dim=0))
        sparse_embed = torch.cat(spase_embed_list)
        dense_embed = torch.cat(dense_embed_list)
        low_res_masks, iou_predictions = self.mask_decoder(
            image_embeddings=image_embeddings,
            image_pe=self.prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embed,
            dense_prompt_embeddings=dense_embed,
            multimask_output=multimask_output,
            batch_ind_list=batch_ind_list,
        )
        # Split the flat decoder outputs back into per-image chunks.
        low_res_masks = torch.split(low_res_masks, batch_ind_list, dim=0)
        iou_predictions = torch.split(iou_predictions, batch_ind_list, dim=0)
        outputs = []
        for image_record, low_res_mask, iou_prediction in zip(batched_input, low_res_masks, iou_predictions):
            masks = self.postprocess_masks(
                low_res_mask,
                input_size=image_record["image"].shape[-2:],
                original_size=image_record["original_size"],
            )
            # Binarize the upscaled logits at the model threshold.
            masks = masks > self.mask_threshold
            outputs.append(
                {
                    "masks": masks,
                    "iou_predictions": iou_prediction,
                    "low_res_logits": low_res_mask,
                }
            )
        return outputs

    def postprocess_masks(
        self,
        masks: torch.Tensor,
        input_size: Tuple[int, ...],
        original_size: Tuple[int, ...],
    ) -> torch.Tensor:
        """
        Remove padding and upscale masks to the original image size.

        Arguments:
          masks (torch.Tensor): Batched masks from the mask_decoder,
            in BxCxHxW format.
          input_size (tuple(int, int)): The size of the image input to the
            model, in (H, W) format. Used to remove padding.
          original_size (tuple(int, int)): The original size of the image
            before resizing for input to the model, in (H, W) format.

        Returns:
          (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
            is given by original_size.
        """
        # Upscale to the padded square input resolution first...
        masks = F.interpolate(
            masks,
            (self.image_encoder.img_size, self.image_encoder.img_size),
            mode="bilinear",
            align_corners=False,
        )
        # ...then crop away the bottom/right padding added by preprocess()...
        masks = masks[..., : input_size[0], : input_size[1]]
        # ...and finally resize to the original image size.
        masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False)
        return masks

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
        # Normalize colors
        x = (x - self.pixel_mean) / self.pixel_std

        # Pad on the bottom/right so the result is img_size x img_size.
        h, w = x.shape[-2:]
        padh = self.image_encoder.img_size - h
        padw = self.image_encoder.img_size - w
        x = F.pad(x, (0, padw, 0, padh))
        return x
\ No newline at end of file
diff --git a/SAM/modeling/transformer.py b/SAM/modeling/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..33b8070d3c18d11a43fcca3e1b3d7d33dc9a1147
--- /dev/null
+++ b/SAM/modeling/transformer.py
@@ -0,0 +1,240 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import Tensor, nn
+
+import math
+from typing import Tuple, Type
+
+from .common import MLPBlock
+
+
class TwoWayTransformer(nn.Module):
    """
    A transformer decoder in which queries (point/prompt tokens) and keys
    (image tokens) attend to each other in both directions at every layer.
    """

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
          attention_downsample_rate (int): channel downsampling factor used
            inside the cross-attention layers
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    # The very first layer skips adding PE in self-attention.
                    skip_first_layer_pe=(i == 0),
                )
            )

        # One last token-to-image attention after the stacked blocks.
        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries
        queries = point_embedding
        keys = image_embedding

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=queries,
                keys=keys,
                # Positional embeddings are re-added at every layer.
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys
+
+
class TwoWayAttentionBlock(nn.Module):
    """
    One layer of the two-way transformer: sparse tokens and dense image
    tokens exchange information through self- and cross-attention.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          attention_downsample_rate (int): channel downsampling factor for
            the cross-attention layers
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """Run the four sub-layers; returns updated (queries, keys)."""
        # Self attention block
        if self.skip_first_layer_pe:
            # First layer: the queries ARE the PE, so don't add it again.
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        # (note q/k roles are swapped relative to the block above).
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys
+
+
class Attention(nn.Module):
    """
    Multi-head attention that can shrink the channel dimension of the
    internal q/k/v projections by an integer downsample_rate.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        # Project into the (possibly downsampled) internal dimension,
        # then back out to the embedding dimension.
        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        """B x N x C -> B x N_heads x N x C_per_head."""
        batch, tokens, channels = x.shape
        x = x.reshape(batch, tokens, num_heads, channels // num_heads)
        return x.transpose(1, 2)

    def _recombine_heads(self, x: Tensor) -> Tensor:
        """B x N_heads x N x C_per_head -> B x N x C."""
        batch, heads, tokens, per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(batch, tokens, heads * per_head)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Project the inputs and split channels across heads.
        q = self._separate_heads(self.q_proj(q), self.num_heads)
        k = self._separate_heads(self.k_proj(k), self.num_heads)
        v = self._separate_heads(self.v_proj(v), self.num_heads)

        # Scaled dot-product attention: B x N_heads x N_tokens x N_tokens.
        per_head = q.shape[-1]
        scores = q @ k.transpose(-2, -1) / math.sqrt(per_head)
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, merge heads, project back out.
        out = self._recombine_heads(weights @ v)
        return self.out_proj(out)
\ No newline at end of file
diff --git a/SAM/predictor.py b/SAM/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..877324aa85f4f22e44f22df6889188e544c9c412
--- /dev/null
+++ b/SAM/predictor.py
@@ -0,0 +1,269 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from .modeling import Sam
+
+from typing import Optional, Tuple
+
+from .utils.transforms import ResizeLongestSide
+
+
class SamPredictor:
    """
    Wraps a Sam model: computes an image embedding once via set_image, then
    serves repeated, cheap mask predictions for different prompts.
    """

    def __init__(
        self,
        sam_model: Sam,
    ) -> None:
        """
        Uses SAM to calculate the image embedding for an image, and then
        allow repeated, efficient mask prediction given prompts.

        Arguments:
          sam_model (Sam): The model to use for mask prediction.
        """
        super().__init__()
        self.model = sam_model
        self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
        self.reset_image()

    def set_image(
        self,
        image: np.ndarray,
        image_format: str = "RGB",
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method.

        Arguments:
          image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
          image_format (str): The color format of the image, in ['RGB', 'BGR'].
        """
        assert image_format in [
            "RGB",
            "BGR",
        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
        if image_format != self.model.image_format:
            # Flip channel order to match the model's expected format.
            image = image[..., ::-1]

        # Transform the image to the form expected by the model
        input_image = self.transform.apply_image(image)
        input_image_torch = torch.as_tensor(input_image, device=self.device)
        # HWC -> 1x3xHxW (contiguous for the encoder).
        input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]

        self.set_torch_image(input_image_torch, image.shape[:2])

    @torch.no_grad()
    def set_torch_image(
        self,
        transformed_image: torch.Tensor,
        original_image_size: Tuple[int, ...],
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method. Expects the input
        image to be already transformed to the format expected by the model.

        Arguments:
          transformed_image (torch.Tensor): The input image, with shape
            1x3xHxW, which has been transformed with ResizeLongestSide.
          original_image_size (tuple(int, int)): The size of the image
            before transformation, in (H, W) format.
        """
        assert (
            len(transformed_image.shape) == 4
            and transformed_image.shape[1] == 3
            and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size
        ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}."
        self.reset_image()

        self.original_size = original_image_size
        self.input_size = tuple(transformed_image.shape[-2:])
        input_image = self.model.preprocess(transformed_image)
        # The expensive encoder pass happens once here; predict() reuses it.
        self.features = self.model.image_encoder(input_image)
        self.is_image_set = True

    def predict(
        self,
        point_coords: Optional[np.ndarray] = None,
        point_labels: Optional[np.ndarray] = None,
        box: Optional[np.ndarray] = None,
        mask_input: Optional[np.ndarray] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict masks for the given input prompts, using the currently set image.

        Arguments:
          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (np.ndarray or None): A length N array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          box (np.ndarray or None): A length 4 array given a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form 1xHxW, where
            for SAM, H=W=256.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (np.ndarray): The output masks in CxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of length C containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape CxHxW, where C is the number
            of masks and H=W=256. These low resolution logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")

        # Transform input prompts
        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
        if point_coords is not None:
            assert (
                point_labels is not None
            ), "point_labels must be supplied if point_coords is supplied."
            # Map coordinates into the model's input frame, then add a
            # leading batch dimension of 1.
            point_coords = self.transform.apply_coords(point_coords, self.original_size)
            coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.device)
            labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
        if box is not None:
            box = self.transform.apply_boxes(box, self.original_size)
            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
            box_torch = box_torch[None, :]
        if mask_input is not None:
            mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.device)
            mask_input_torch = mask_input_torch[None, :, :, :]

        masks, iou_predictions, low_res_masks = self.predict_torch(
            coords_torch,
            labels_torch,
            box_torch,
            mask_input_torch,
            multimask_output,
            return_logits=return_logits,
        )

        # Strip the batch dimension of 1 and move results to numpy.
        masks_np = masks[0].detach().cpu().numpy()
        iou_predictions_np = iou_predictions[0].detach().cpu().numpy()
        low_res_masks_np = low_res_masks[0].detach().cpu().numpy()
        return masks_np, iou_predictions_np, low_res_masks_np

    @torch.no_grad()
    def predict_torch(
        self,
        point_coords: Optional[torch.Tensor],
        point_labels: Optional[torch.Tensor],
        boxes: Optional[torch.Tensor] = None,
        mask_input: Optional[torch.Tensor] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks for the given input prompts, using the currently set image.
        Input prompts are batched torch tensors and are expected to already be
        transformed to the input frame using ResizeLongestSide.

        Arguments:
          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (torch.Tensor or None): A BxN array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          boxes (np.ndarray or None): A Bx4 array given a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form Bx1xHxW, where
            for SAM, H=W=256. Masks returned by a previous iteration of the
            predict method do not need further transformation.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (torch.Tensor): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (torch.Tensor): An array of shape BxC containing the model's
            predictions for the quality of each mask.
          (torch.Tensor): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low res logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")

        if point_coords is not None:
            points = (point_coords, point_labels)
        else:
            points = None

        # Embed prompts
        sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
            points=points,
            boxes=boxes,
            masks=mask_input,
        )

        # Predict masks using the cached image embedding.
        low_res_masks, iou_predictions = self.model.mask_decoder(
            image_embeddings=self.features,
            image_pe=self.model.prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
        )

        # Upscale the masks to the original image resolution
        masks = self.model.postprocess_masks(low_res_masks, self.input_size, self.original_size)

        if not return_logits:
            masks = masks > self.model.mask_threshold

        return masks, iou_predictions, low_res_masks

    def get_image_embedding(self) -> torch.Tensor:
        """
        Returns the image embeddings for the currently set image, with
        shape 1xCxHxW, where C is the embedding dimension and (H,W) are
        the embedding spatial dimension of SAM (typically C=256, H=W=64).

        Raises:
          RuntimeError: if no image has been set with set_image.
        """
        if not self.is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) to generate an embedding."
            )
        assert self.features is not None, "Features must exist if an image has been set."
        return self.features

    @property
    def device(self) -> torch.device:
        # Delegates to the wrapped model's device.
        return self.model.device

    def reset_image(self) -> None:
        """Resets the currently set image."""
        self.is_image_set = False
        # Cached image embedding from the encoder (set by set_torch_image).
        self.features = None
        self.orig_h = None
        self.orig_w = None
        self.input_h = None
        self.input_w = None
\ No newline at end of file
diff --git a/SAM/utils/__pycache__/amg.cpython-310.pyc b/SAM/utils/__pycache__/amg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71ad4b29f690a99c0740652956b53dcf23ef30f9
Binary files /dev/null and b/SAM/utils/__pycache__/amg.cpython-310.pyc differ
diff --git a/SAM/utils/__pycache__/transforms.cpython-310.pyc b/SAM/utils/__pycache__/transforms.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4635c30403ce8c463ce28eb8bc840471354f2ef2
Binary files /dev/null and b/SAM/utils/__pycache__/transforms.cpython-310.pyc differ
diff --git a/SAM/utils/amg.py b/SAM/utils/amg.py
new file mode 100644
index 0000000000000000000000000000000000000000..29deb156d2c6d1a350812db4aaca5c667d5f8960
--- /dev/null
+++ b/SAM/utils/amg.py
@@ -0,0 +1,346 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+
class MaskData:
    """
    Batched container for masks and their per-mask statistics, keyed by
    name. Supports row filtering, concatenation, and numpy conversion.
    """

    def __init__(self, **kwargs) -> None:
        for value in kwargs.values():
            assert isinstance(
                value, (list, np.ndarray, torch.Tensor)
            ), "MaskData only supports list, numpy arrays, and torch tensors."
        self._stats = dict(**kwargs)

    def __setitem__(self, key: str, item: Any) -> None:
        assert isinstance(
            item, (list, np.ndarray, torch.Tensor)
        ), "MaskData only supports list, numpy arrays, and torch tensors."
        self._stats[key] = item

    def __delitem__(self, key: str) -> None:
        del self._stats[key]

    def __getitem__(self, key: str) -> Any:
        return self._stats[key]

    def items(self) -> ItemsView[str, Any]:
        return self._stats.items()

    def filter(self, keep: torch.Tensor) -> None:
        """Keep only the rows selected by `keep` (bool mask or index tensor)."""
        for key in list(self._stats.keys()):
            value = self._stats[key]
            if value is None:
                self._stats[key] = None
            elif isinstance(value, torch.Tensor):
                self._stats[key] = value[torch.as_tensor(keep, device=value.device)]
            elif isinstance(value, np.ndarray):
                self._stats[key] = value[keep.detach().cpu().numpy()]
            elif isinstance(value, list) and keep.dtype == torch.bool:
                self._stats[key] = [item for flag, item in zip(keep, value) if flag]
            elif isinstance(value, list):
                self._stats[key] = [value[i] for i in keep]
            else:
                raise TypeError(f"MaskData key {key} has an unsupported type {type(value)}.")

    def cat(self, new_stats: "MaskData") -> None:
        """Append the rows of another MaskData, key by key."""
        for key, value in new_stats.items():
            if key not in self._stats or self._stats[key] is None:
                self._stats[key] = deepcopy(value)
            elif isinstance(value, torch.Tensor):
                self._stats[key] = torch.cat([self._stats[key], value], dim=0)
            elif isinstance(value, np.ndarray):
                self._stats[key] = np.concatenate([self._stats[key], value], axis=0)
            elif isinstance(value, list):
                self._stats[key] = self._stats[key] + deepcopy(value)
            else:
                raise TypeError(f"MaskData key {key} has an unsupported type {type(value)}.")

    def to_numpy(self) -> None:
        """Convert every torch tensor entry to a numpy array in place."""
        for key, value in self._stats.items():
            if isinstance(value, torch.Tensor):
                self._stats[key] = value.detach().cpu().numpy()
+
+
def is_box_near_crop_edge(
    boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
) -> torch.Tensor:
    """Flag boxes that touch the crop boundary but not the original image edge."""
    crop_edges = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
    image_edges = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
    global_boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
    at_crop = torch.isclose(global_boxes, crop_edges[None, :], atol=atol, rtol=0)
    at_image = torch.isclose(global_boxes, image_edges[None, :], atol=atol, rtol=0)
    # A side counts only when it hugs the crop edge without hugging the image edge.
    return (at_crop & ~at_image).any(dim=1)
+
+
def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
    """Convert a box from corner (x0, y0, x1, y1) format to (x, y, w, h)."""
    converted = deepcopy(box_xyxy)
    # Replace the far-corner coordinates with width/height on the copy.
    for corner_axis in (2, 3):
        converted[corner_axis] = converted[corner_axis] - converted[corner_axis - 2]
    return converted
+
+
def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
    """Yield aligned slices of size batch_size from each input sequence."""
    assert len(args) > 0 and all(
        len(a) == len(args[0]) for a in args
    ), "Batched iteration must have inputs of all the same size."
    total = len(args[0])
    # range() with a step naturally produces ceil(total / batch_size) batches.
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield [seq[start:stop] for seq in args]
+
+
def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
    """
    Encode a batch of binary masks as uncompressed RLE dicts in the
    column-major (Fortran) order expected by pycocotools.
    """
    b, h, w = tensor.shape
    # Column-major order: swap H and W before flattening each mask.
    flat = tensor.permute(0, 2, 1).flatten(1)

    # Positions where consecutive pixels differ mark run boundaries.
    transitions = (flat[:, 1:] ^ flat[:, :-1]).nonzero()

    encodings = []
    for mask_idx in range(b):
        boundary = transitions[transitions[:, 0] == mask_idx, 1]
        endpoints = torch.cat(
            [
                torch.tensor([0], dtype=boundary.dtype, device=boundary.device),
                boundary + 1,
                torch.tensor([h * w], dtype=boundary.dtype, device=boundary.device),
            ]
        )
        run_lengths = (endpoints[1:] - endpoints[:-1]).detach().cpu().tolist()
        # RLE starts by counting background; prepend an empty zero-run if
        # the mask begins with a foreground pixel.
        counts = ([0] if flat[mask_idx, 0] != 0 else []) + run_lengths
        encodings.append({"size": [h, w], "counts": counts})
    return encodings
+
+
def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
    """Decode an uncompressed (column-major) RLE into a binary HxW mask."""
    h, w = rle["size"]
    flat = np.empty(h * w, dtype=bool)
    pos = 0
    value = False  # runs alternate, starting with background
    for run in rle["counts"]:
        flat[pos : pos + run] = value
        pos += run
        value = not value
    # Data is column-major: reshape as (w, h), then transpose into C order.
    return flat.reshape(w, h).transpose()
+
+
def area_from_rle(rle: Dict[str, Any]) -> int:
    """Return the foreground area: odd-indexed runs count foreground pixels."""
    counts = rle["counts"]
    total = 0
    for i in range(1, len(counts), 2):
        total += counts[i]
    return total
+
+
def calculate_stability_score(
    masks: torch.Tensor, mask_threshold: float, threshold_offset: float
) -> torch.Tensor:
    """
    Stability score for a batch of masks: the IoU between binarizations
    of the mask logits at a high and a low threshold.
    """

    def _area(threshold: float) -> torch.Tensor:
        # One thresholded mask always contains the other, so IoU reduces to
        # an area ratio. The int16/int32 sums avoid the default int64 cast.
        return (
            (masks > threshold)
            .sum(-1, dtype=torch.int16)
            .sum(-1, dtype=torch.int32)
        )

    intersections = _area(mask_threshold + threshold_offset)
    unions = _area(mask_threshold - threshold_offset)
    return intersections / unions
+
+
def build_point_grid(n_per_side: int) -> np.ndarray:
    """Generate an n x n grid of (x, y) points evenly spaced in [0,1]x[0,1]."""
    # Half a cell of margin keeps points at cell centers, not edges.
    offset = 1 / (2 * n_per_side)
    coords = np.linspace(offset, 1 - offset, n_per_side)
    xs, ys = np.meshgrid(coords, coords)
    return np.stack([xs, ys], axis=-1).reshape(-1, 2)
+
+
def build_all_layer_point_grids(
    n_per_side: int, n_layers: int, scale_per_layer: int
) -> List[np.ndarray]:
    """Generate one point grid per crop layer, shrinking the side count per layer."""
    return [
        build_point_grid(int(n_per_side / (scale_per_layer**layer)))
        for layer in range(n_layers + 1)
    ]
+
+
+def generate_crop_boxes(
+ im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+ """
+ Generates a list of crop boxes of different sizes. Each layer
+ has (2**i)**2 boxes for the ith layer.
+ """
+ crop_boxes, layer_idxs = [], []
+ im_h, im_w = im_size
+ short_side = min(im_h, im_w)
+
+ # Original image
+ crop_boxes.append([0, 0, im_w, im_h])
+ layer_idxs.append(0)
+
+ def crop_len(orig_len, n_crops, overlap):
+ return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
+
+ for i_layer in range(n_layers):
+ n_crops_per_side = 2 ** (i_layer + 1)
+ overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
+
+ crop_w = crop_len(im_w, n_crops_per_side, overlap)
+ crop_h = crop_len(im_h, n_crops_per_side, overlap)
+
+ crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
+ crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
+
+ # Crops in XYXY format
+ for x0, y0 in product(crop_box_x0, crop_box_y0):
+ box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+ crop_boxes.append(box)
+ layer_idxs.append(i_layer + 1)
+
+ return crop_boxes, layer_idxs
+
+
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
+ # Check if boxes has a channel dimension
+ if len(boxes.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return boxes + offset
+
+
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0]], device=points.device)
+ # Check if points has a channel dimension
+ if len(points.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return points + offset
+
+
+def uncrop_masks(
+ masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+ x0, y0, x1, y1 = crop_box
+ if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
+ return masks
+ # Coordinate transform masks
+ pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
+ pad = (x0, pad_x - x0, y0, pad_y - y0)
+ return torch.nn.functional.pad(masks, pad, value=0)
+
+
+def remove_small_regions(
+ mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+ """
+ Removes small disconnected regions and holes in a mask. Returns the
+ mask and an indicator of if the mask has been modified.
+ """
+ import cv2 # type: ignore
+
+ assert mode in ["holes", "islands"]
+ correct_holes = mode == "holes"
+ working_mask = (correct_holes ^ mask).astype(np.uint8)
+ n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+ sizes = stats[:, -1][1:] # Row 0 is background label
+ small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+ if len(small_regions) == 0:
+ return mask, False
+ fill_labels = [0] + small_regions
+ if not correct_holes:
+ fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+ # If every region is below threshold, keep largest
+ if len(fill_labels) == 0:
+ fill_labels = [int(np.argmax(sizes)) + 1]
+ mask = np.isin(regions, fill_labels)
+ return mask, True
+
+
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+ from pycocotools import mask as mask_utils # type: ignore
+
+ h, w = uncompressed_rle["size"]
+ rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
+ rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
+ return rle
+
+
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
+ an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
+ """
+ # torch.max below raises an error on empty inputs, just skip in this case
+ if torch.numel(masks) == 0:
+ return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+ # Normalize shape to CxHxW
+ shape = masks.shape
+ h, w = shape[-2:]
+ if len(shape) > 2:
+ masks = masks.flatten(0, -3)
+ else:
+ masks = masks.unsqueeze(0)
+
+ # Get top and bottom edges
+ in_height, _ = torch.max(masks, dim=-1)
+ in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
+ bottom_edges, _ = torch.max(in_height_coords, dim=-1)
+ in_height_coords = in_height_coords + h * (~in_height)
+ top_edges, _ = torch.min(in_height_coords, dim=-1)
+
+ # Get left and right edges
+ in_width, _ = torch.max(masks, dim=-2)
+ in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
+ right_edges, _ = torch.max(in_width_coords, dim=-1)
+ in_width_coords = in_width_coords + w * (~in_width)
+ left_edges, _ = torch.min(in_width_coords, dim=-1)
+
+ # If the mask is empty the right edge will be to the left of the left edge.
+ # Replace these boxes with [0, 0, 0, 0]
+ empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
+ out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+ out = out * (~empty_filter).unsqueeze(-1)
+
+ # Return to original shape
+ if len(shape) > 2:
+ out = out.reshape(*shape[:-2], 4)
+ else:
+ out = out[0]
+
+ return out
\ No newline at end of file
diff --git a/SAM/utils/transforms.py b/SAM/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cdd0e02b123bd508180e3697a94e1af1a1aa570
--- /dev/null
+++ b/SAM/utils/transforms.py
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize, to_pil_image # type: ignore
+
+from copy import deepcopy
+from typing import Tuple
+
+
+class ResizeLongestSide:
+ """
+ Resizes images to the longest side 'target_length', as well as provides
+ methods for resizing coordinates and boxes. Provides methods for
+ transforming both numpy array and batched torch tensors.
+ """
+
+ def __init__(self, target_length: int) -> None:
+ self.target_length = target_length
+
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
+ """
+ Expects a numpy array with shape HxWxC in uint8 format.
+ """
+ target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
+ return np.array(resize(to_pil_image(image), target_size))
+
+ def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array of length 2 in the final dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).astype(float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array shape Bx4. Requires the original image size
+ in (H, W) format.
+ """
+ boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+ """
+ Expects batched images with shape BxCxHxW and float format. This
+ transformation may not exactly match apply_image. apply_image is
+ the transformation expected by the model.
+ """
+ # Expects an image in BCHW format. May not exactly match apply_image.
+ target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length)
+ return F.interpolate(
+ image, target_size, mode="bilinear", align_corners=False, antialias=True
+ )
+
+ def apply_coords_torch(
+ self, coords: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with length 2 in the last dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).to(torch.float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes_torch(
+ self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with shape Bx4. Requires the original image
+ size in (H, W) format.
+ """
+ boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ @staticmethod
+ def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
+ """
+ Compute the output size given input size and target long side length.
+ """
+ scale = long_side_length * 1.0 / max(oldh, oldw)
+ newh, neww = oldh * scale, oldw * scale
+ neww = int(neww + 0.5)
+ newh = int(newh + 0.5)
+ return (newh, neww)
\ No newline at end of file
diff --git a/__pycache__/evaluate.cpython-310.pyc b/__pycache__/evaluate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1913b754054bf7fbe9af2c83edd5ae72ddbef55
Binary files /dev/null and b/__pycache__/evaluate.cpython-310.pyc differ
diff --git a/__pycache__/load_nvos.cpython-310.pyc b/__pycache__/load_nvos.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1135c654028930ec45969849db96c7796a02500
Binary files /dev/null and b/__pycache__/load_nvos.cpython-310.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1dcd6ad80fdfc29e27eea6e64be08b4fec23215
--- /dev/null
+++ b/app.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# gradio demo
+# --------------------------------------------------------
+import argparse
+import gradio
+import os
+import torch
+import numpy as np
+import tempfile
+import functools
+import trimesh
+import copy
+from scipy.spatial.transform import Rotation
+
+from dust3r.inference import inference, load_model
+from dust3r.image_pairs import make_pairs
+from dust3r.utils.image import load_images, rgb
+from dust3r.utils.device import to_numpy
+from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
+from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
+
+import matplotlib.pyplot as plt
+plt.ion()
+
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+batch_size = 1
+
+def show_mask(mask, ax, random_color=False):
+ if random_color:
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+ else:
+ color = np.array([30/255, 144/255, 255/255, 0.6])
+ h, w = mask.shape[-2:]
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+ ax.imshow(mask_image)
+
+def show_points(coords, labels, ax, marker_size=375):
+ pos_points = coords[labels==1]
+ neg_points = coords[labels==0]
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+
+def show_box(box, ax):
+ x0, y0 = box[0], box[1]
+ w, h = box[2] - box[0], box[3] - box[1]
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
+
+from SAM import SamPredictor
+from SAM.build_sam import sam_model_registry
+sam_checkpoint = "checkpoints/sam_vit_b_01ec64.pth"
+model_type = "vit_b"
+
+sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
+sam.to(device='cuda')
+predictor = SamPredictor(sam)
+
+def get_args_parser():
+ parser = argparse.ArgumentParser()
+ parser_url = parser.add_mutually_exclusive_group()
+ parser_url.add_argument("--local_network", action='store_true', default=False,
+ help="make app accessible on local network: address will be set to 0.0.0.0")
+ parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 127.0.0.1")
+ parser.add_argument("--image_size", type=int, default=512, choices=[512, 224], help="image size")
+ parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). "
+ "If None, will search for an available port starting at 7860."),
+ default=None)
+ parser.add_argument("--weights", type=str, required=True, help="path to the model weights")
+ parser.add_argument("--device", type=str, default='cuda', help="pytorch device")
+ parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir")
+ return parser
+
+
+def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
+ cam_color=None, as_pointcloud=False, transparent_cams=False):
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+ pts3d = to_numpy(pts3d)
+ imgs = to_numpy(imgs)
+ focals = to_numpy(focals)
+ cams2world = to_numpy(cams2world)
+
+ scene = trimesh.Scene()
+
+ # full pointcloud
+ if as_pointcloud:
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+ scene.add_geometry(pct)
+ else:
+ meshes = []
+ for i in range(len(imgs)):
+ meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
+ scene.add_geometry(mesh)
+
+ # add each camera
+ for i, pose_c2w in enumerate(cams2world):
+ if isinstance(cam_color, list):
+ camera_edge_color = cam_color[i]
+ else:
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
+ None if transparent_cams else imgs[i], focals[i],
+ imsize=imgs[i].shape[1::-1], screen_width=cam_size)
+
+ rot = np.eye(4)
+ rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+ scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+ outfile = os.path.join(outdir, 'scene.glb')
+ print('(exporting 3D scene to', outfile, ')')
+ scene.export(file_obj=outfile)
+ return outfile
+
+
+def get_3D_model_from_scene(outdir, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
+ clean_depth=False, transparent_cams=False, cam_size=0.05):
+ """
+ extract 3D_model (glb file) from a reconstructed scene
+ """
+ if scene is None:
+ return None
+ # post processes
+ if clean_depth:
+ scene = scene.clean_pointcloud()
+ if mask_sky:
+ scene = scene.mask_sky()
+
+ # get optimized values from scene
+ rgbimg = scene.imgs
+ # print('SAM step...')
+ # predictor.set_image((rgbimg[0] * 255).astype(np.uint8))
+ # h,w,c = rgbimg[0].shape
+ # input_point = np.array([
+ # [int(w/2), int(h/2)],
+ # [int(w/2), int(h/2)-20]
+ # ])
+ # input_label = np.array([1,1])
+ # masks1, scores, logits = predictor.predict(
+ # point_coords=input_point,
+ # point_labels=input_label,
+ # multimask_output=False,
+ # )
+ # fig, ax = plt.subplots(4, 2, figsize=(20, 20))
+ # show_mask(masks1[0], ax[0][0], random_color=True)
+ # show_points(input_point, input_label, ax[0][0])
+ # ax[0][1].imshow(rgbimg[0])
+
+ # predictor.set_image((rgbimg[1] * 255).astype(np.uint8))
+ # h,w,c = rgbimg[1].shape
+ # input_point = np.array([
+ # [int(w/2), int(h/2)],
+ # [int(w/2), int(h/2)-20]
+ # ])
+ # input_label = np.array([1,1])
+ # masks2, scores, logits = predictor.predict(
+ # point_coords=input_point,
+ # point_labels=input_label,
+ # multimask_output=False,
+ # )
+ focals = scene.get_focals().cpu()
+ cams2world = scene.get_im_poses().cpu()
+ # 3D pointcloud from depthmap, poses and intrinsics
+ pts3d = to_numpy(scene.get_pts3d())
+ scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
+ msk = to_numpy(scene.get_masks())
+ # ax[1][0].imshow(msk[0])
+ # msk[0] = msk[0] & masks1[0]
+ # ax[1][1].imshow(msk[0])
+ # ax[2][1].imshow(rgbimg[1])
+ # show_mask(masks2[0], ax[2][0], random_color=True)
+ # show_points(input_point, input_label, ax[2][0])
+ # ax[3][0].imshow(msk[1])
+ # # msk[1] = msk[1] & masks2[0]
+ # ax[3][1].imshow(msk[1])
+ # plt.savefig("rgb.png")
+ # import pdb
+ # pdb.set_trace()
+ return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
+ transparent_cams=transparent_cams, cam_size=cam_size)
+
+
+def get_reconstructed_scene(outdir, model, device, image_size, filelist, schedule, niter, min_conf_thr,
+ as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid):
+ """
+ from a list of images, run dust3r inference, global aligner.
+ then run get_3D_model_from_scene
+ """
+ imgs = load_images(filelist, size=image_size)
+ if len(imgs) == 1:
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
+ imgs[1]['idx'] = 1
+ if scenegraph_type == "swin":
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
+ elif scenegraph_type == "oneref":
+ scenegraph_type = scenegraph_type + "-" + str(refid)
+
+ pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
+ output = inference(pairs, model, device, batch_size=batch_size)
+
+ mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
+ scene = global_aligner(output, device=device, mode=mode)
+ lr = 0.01
+
+ if mode == GlobalAlignerMode.PointCloudOptimizer:
+ loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
+
+ outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size)
+
+ # also return rgb, depth and confidence imgs
+ # depth is normalized with the max value for all images
+ # we apply the jet colormap on the confidence maps
+ rgbimg = scene.imgs
+ depths = to_numpy(scene.get_depthmaps())
+ confs = to_numpy([c for c in scene.im_conf])
+ cmap = plt.get_cmap('jet')
+ depths_max = max([d.max() for d in depths])
+ depths = [d/depths_max for d in depths]
+ confs_max = max([d.max() for d in confs])
+ confs = [cmap(d/confs_max) for d in confs]
+
+ imgs = []
+ for i in range(len(rgbimg)):
+ imgs.append(rgbimg[i])
+ imgs.append(rgb(depths[i]))
+ imgs.append(rgb(confs[i]))
+
+ return scene, outfile, imgs
+
+
+def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
+ num_files = len(inputfiles) if inputfiles is not None else 1
+ max_winsize = max(1, (num_files - 1)//2)
+ if scenegraph_type == "swin":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=True)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=False)
+ elif scenegraph_type == "oneref":
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=True)
+ else:
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+ minimum=1, maximum=max_winsize, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+ maximum=num_files-1, step=1, visible=False)
+ return winsize, refid
+
+
+def main_demo(tmpdirname, model, device, image_size, server_name, server_port):
+ recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, image_size)
+ model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname)
+ with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="DUSt3R Demo") as demo:
+ # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
+ scene = gradio.State(None)
+ gradio.HTML('
DUSt3R Demo
')
+ with gradio.Column():
+ inputfiles = gradio.File(file_count="multiple")
+ with gradio.Row():
+ schedule = gradio.Dropdown(["linear", "cosine"],
+ value='linear', label="schedule", info="For global alignment!")
+ niter = gradio.Number(value=300, precision=0, minimum=0, maximum=5000,
+ label="num_iterations", info="For global alignment!")
+ scenegraph_type = gradio.Dropdown(["complete", "swin", "oneref"],
+ value='complete', label="Scenegraph",
+ info="Define how to make pairs",
+ interactive=True)
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
+ minimum=1, maximum=1, step=1, visible=False)
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
+
+ run_btn = gradio.Button("Run")
+
+ with gradio.Row():
+ # adjust the confidence threshold
+ min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1)
+ # adjust the camera size in the output pointcloud
+ cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001)
+ with gradio.Row():
+ as_pointcloud = gradio.Checkbox(value=False, label="As pointcloud")
+ # two post process implemented
+ mask_sky = gradio.Checkbox(value=False, label="Mask sky")
+ clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
+ transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
+
+ outmodel = gradio.Model3D()
+ outgallery = gradio.Gallery(label='rgb,depth,confidence', columns=3, height="100%")
+
+ # events
+ scenegraph_type.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ inputfiles.change(set_scenegraph_options,
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
+ outputs=[winsize, refid])
+ run_btn.click(fn=recon_fun,
+ inputs=[inputfiles, schedule, niter, min_conf_thr, as_pointcloud,
+ mask_sky, clean_depth, transparent_cams, cam_size,
+ scenegraph_type, winsize, refid],
+ outputs=[scene, outmodel, outgallery])
+ min_conf_thr.release(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ cam_size.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ as_pointcloud.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ mask_sky.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ clean_depth.change(fn=model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ transparent_cams.change(model_from_scene_fun,
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+ clean_depth, transparent_cams, cam_size],
+ outputs=outmodel)
+ demo.launch(share=False, server_name=server_name, server_port=server_port)
+
+
+if __name__ == '__main__':
+ parser = get_args_parser()
+ args = parser.parse_args()
+
+ if args.tmp_dir is not None:
+ tmp_path = args.tmp_dir
+ os.makedirs(tmp_path, exist_ok=True)
+ tempfile.tempdir = tmp_path
+
+ if args.server_name is not None:
+ server_name = args.server_name
+ else:
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
+
+ model = load_model(args.weights, args.device)
+ # dust3r will write the 3D model inside tmpdirname
+ with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
+ print('Outputing stuff in', tmpdirname)
+ main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port)
diff --git a/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth b/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90014c0f6bab509e081b52712cc31e1f191d2a4a
--- /dev/null
+++ b/checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e8bbf0c4d1d6007f5343f3f45814b956ddc5bbb4d00cb66beaf73afe5c53b34
+size 2285019929
diff --git a/checkpoints/sam_vit_b_01ec64.pth b/checkpoints/sam_vit_b_01ec64.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ab7d111e57bd052a76fe669986560e3555e9c8f6
--- /dev/null
+++ b/checkpoints/sam_vit_b_01ec64.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912
+size 375042383
diff --git a/configs/default.py b/configs/default.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb15c0add64e922053afbcaa15a7d37755130d83
--- /dev/null
+++ b/configs/default.py
@@ -0,0 +1,119 @@
+from copy import deepcopy
+
+expname = None # experiment name
+basedir = './logs/' # where to store ckpts and logs
+
+''' Template of data options
+'''
+data = dict(
+ datadir=None, # path to dataset root folder
+ dataset_type=None, # blender | nsvf | blendedmvs | tankstemple | deepvoxels | co3d
+ inverse_y=False, # intrinsic mode (to support blendedmvs, nsvf, tankstemple)
+ flip_x=False, # to support co3d
+ flip_y=False, # to support co3d
+ annot_path='', # to support co3d
+ split_path='', # to support co3d
+ sequence_name='', # to support co3d
+# load2gpu_on_the_fly=False, # do not load all images into gpu (to save gpu memory)
+ load2gpu_on_the_fly=True, # do not load all images into gpu (to save gpu memory)
+ testskip=5, # subsample testset to preview results
+ white_bkgd=True, # use white background (note that some datasets don't provide alpha and use a blended bg color)
+ rand_bkgd=False, # use random background during training
+ half_res=False, # [TODO]
+ bd_factor=.75,
+ movie_render_kwargs=dict(),
+
+ # Below are forward-facing llff specific settings.
+ ndc=False, # use ndc coordinate (only for forward-facing; not support yet)
+ spherify=False, # inward-facing
+ factor=4, # [TODO]
+ width=None, # enforce image width
+ height=None, # enforce image height
+ llffhold=8, # testsplit
+ load_depths=False, # load depth
+
+ # Below are unbounded inward-facing specific settings.
+ unbounded_inward=False,
+ unbounded_inner_r=1.0,
+)
+
+''' Template of training options
+'''
+coarse_train = dict(
+ N_iters=5000, # number of optimization steps
+ N_rand=8192, # batch size (number of random rays per optimization step)
+ #N_rand=1024, # batch size (number of random rays per optimization step)
+ lrate_density=1e-1, # lr of density voxel grid
+ lrate_k0=1e-1, # lr of color/feature voxel grid
+ lrate_rgbnet=1e-3, # lr of the mlp to predict view-dependent color
+ lrate_decay=20, # lr decay by 0.1 after every lrate_decay*1000 steps
+ pervoxel_lr=True, # view-count-based lr
+ pervoxel_lr_downrate=1, # downsampled image for computing view-count-based lr
+ ray_sampler='random', # ray sampling strategies
+ weight_main=1.0, # weight of photometric loss
+ weight_entropy_last=0.01, # weight of background entropy loss
+ weight_nearclip=0,
+ weight_distortion=0,
+ weight_rgbper=0.1, # weight of per-point rgb loss
+ tv_every=1, # count total variation loss every tv_every step
+ tv_after=0, # count total variation loss from tv_from step
+ tv_before=0, # count total variation before the given number of iterations
+ tv_dense_before=0, # count total variation densely before the given number of iterations
+ weight_tv_density=0.0, # weight of total variation loss of density voxel grid
+ weight_tv_k0=0.0, # weight of total variation loss of color/feature voxel grid
+ pg_scale=[], # checkpoints for progressive scaling
+ decay_after_scale=1.0, # decay act_shift after scaling
+ skip_zero_grad_fields=[], # the variable name to skip optimizing parameters w/ zero grad in each iteration
+ maskout_lt_nviews=0,
+)
+
+fine_train = deepcopy(coarse_train)
+fine_train.update(dict(
+ N_iters=20000,
+ pervoxel_lr=False,
+ ray_sampler='flatten',
+ weight_entropy_last=0.001,
+ weight_rgbper=0.01,
+ pg_scale=[1000, 2000, 3000, 4000],
+ skip_zero_grad_fields=['density', 'k0'],
+))
+
+''' Template of model and rendering options
+'''
+coarse_model_and_render = dict(
+ num_voxels=1024000, # expected number of voxel
+ num_voxels_base=1024000, # to rescale delta distance
+ density_type='DenseGrid', # DenseGrid, TensoRFGrid
+ k0_type='TensoRFGrid', # DenseGrid, TensoRFGrid
+ density_config=dict(),
+ k0_config=dict(n_comp=48),
+ mpi_depth=128, # the number of planes in Multiplane Image (work when ndc=True)
+ nearest=False, # nearest interpolation
+ pre_act_density=False, # pre-activated trilinear interpolation
+ in_act_density=False, # in-activated trilinear interpolation
+ bbox_thres=1e-3, # threshold to determine known free-space in the fine stage
+ mask_cache_thres=1e-3, # threshold to determine a tighten BBox in the fine stage
+ rgbnet_dim=0, # feature voxel grid dim
+ rgbnet_full_implicit=False, # let the colors MLP ignore feature voxel grid
+ rgbnet_direct=True, # set to False to treat the first 3 dim of feature voxel grid as diffuse rgb
+ rgbnet_depth=3, # depth of the colors MLP (there are rgbnet_depth-1 intermediate features)
+ rgbnet_width=128, # width of the colors MLP
+ alpha_init=1e-6, # set the alpha values everywhere at the begin of training
+ fast_color_thres=1e-7, # threshold of alpha value to skip the fine stage sampled point
+ maskout_near_cam_vox=True, # maskout grid points that between cameras and their near planes
+ world_bound_scale=1, # rescale the BBox enclosing the scene
+ stepsize=0.5, # sampling stepsize in volume rendering
+)
+
+fine_model_and_render = deepcopy(coarse_model_and_render)
+fine_model_and_render.update(dict(
+ num_voxels=160**3,
+ num_voxels_base=160**3,
+ rgbnet_dim=12,
+ alpha_init=1e-2,
+ fast_color_thres=1e-4,
+ maskout_near_cam_vox=False,
+ world_bound_scale=1.05,
+))
+
+del deepcopy
diff --git a/configs/lerf/book_store.py b/configs/lerf/book_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..999f66d56eac44c535e7eebe72955cc4b617852f
--- /dev/null
+++ b/configs/lerf/book_store.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_book_store'
+
+data = dict(
+ datadir='./data/lerf_data/book_store',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.5, # positive right
+ shift_y=0.5, # negative down
+ shift_z=1,
+ scale_r=0,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/bouquet.py b/configs/lerf/bouquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..761b6653a4d1db9ffd565891e047d1ef7576e011
--- /dev/null
+++ b/configs/lerf/bouquet.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_bouquet'
+
+data = dict(
+ datadir='./data/lerf_data/bouquet',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/donuts.py b/configs/lerf/donuts.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ee922f9043cb57c63ec3cd97d5d0a2bb618402c
--- /dev/null
+++ b/configs/lerf/donuts.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_donuts'
+
+data = dict(
+ datadir='./data/lerf_data/donuts',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=-0.2,
+ shift_y=0.2,
+ shift_z=0.1,
+ scale_r=1.3,
+ pitch_deg=60,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/dozer_nerfgun_waldo.py b/configs/lerf/dozer_nerfgun_waldo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa6888f66c5eb3848b98075f0f6ab132b300d64d
--- /dev/null
+++ b/configs/lerf/dozer_nerfgun_waldo.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_dozer_nerfgun_waldo'
+
+data = dict(
+ datadir='./data/lerf_data/dozer_nerfgun_waldo',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/espresso.py b/configs/lerf/espresso.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af0f7b84a68874195bfd08af6bba547cdaf61f3
--- /dev/null
+++ b/configs/lerf/espresso.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_espresso'
+
+data = dict(
+ datadir='./data/lerf_data/espresso',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/figurines.py b/configs/lerf/figurines.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b76d6cdfeac1116c1b1bdff89d63d431fd33941
--- /dev/null
+++ b/configs/lerf/figurines.py
@@ -0,0 +1,15 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_figurines'
+
+data = dict(
+ datadir='./data/lerf_data/figurines',
+ factor=2, # 497 * 369
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=55,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/lerf_default.py b/configs/lerf/lerf_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..64859cb80f6e34b7d3b74ed78878932d288f6ab0
--- /dev/null
+++ b/configs/lerf/lerf_default.py
@@ -0,0 +1,52 @@
+# copy from nerf unbounded
+_base_ = '../default.py'
+
+basedir = './logs/lerf'
+
+data = dict(
+ dataset_type='lerf',
+ spherify=False,
+ factor=2,
+ white_bkgd=True,
+ rand_bkgd=True,
+ inverse_y=False, # llff format
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/lerf/room.py b/configs/lerf/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/lerf/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/lerf/seg_lerf/book_store.py b/configs/lerf/seg_lerf/book_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..999f66d56eac44c535e7eebe72955cc4b617852f
--- /dev/null
+++ b/configs/lerf/seg_lerf/book_store.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_book_store'
+
+data = dict(
+ datadir='./data/lerf_data/book_store',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.5, # positive right
+ shift_y=0.5, # negative down
+ shift_z=1,
+ scale_r=0,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/bouquet.py b/configs/lerf/seg_lerf/bouquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..761b6653a4d1db9ffd565891e047d1ef7576e011
--- /dev/null
+++ b/configs/lerf/seg_lerf/bouquet.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_bouquet'
+
+data = dict(
+ datadir='./data/lerf_data/bouquet',
+ factor=2, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=0, # negative look downward
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/donuts.py b/configs/lerf/seg_lerf/donuts.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ccadb75ddafac0dec1838176e28082261be1b51
--- /dev/null
+++ b/configs/lerf/seg_lerf/donuts.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_donuts'
+
+data = dict(
+ datadir='./data/lerf_data/donuts',
+ factor=1, # 497 * 369
+ # factor=4,
+ movie_render_kwargs=dict(
+ shift_x=-0.2,
+ shift_y=0.2,
+ shift_z=0.1,
+ scale_r=1.3,
+ pitch_deg=60,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py b/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa6888f66c5eb3848b98075f0f6ab132b300d64d
--- /dev/null
+++ b/configs/lerf/seg_lerf/dozer_nerfgun_waldo.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_dozer_nerfgun_waldo'
+
+data = dict(
+ datadir='./data/lerf_data/dozer_nerfgun_waldo',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/espresso.py b/configs/lerf/seg_lerf/espresso.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af0f7b84a68874195bfd08af6bba547cdaf61f3
--- /dev/null
+++ b/configs/lerf/seg_lerf/espresso.py
@@ -0,0 +1,16 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_espresso'
+
+data = dict(
+ datadir='./data/lerf_data/espresso',
+ factor=2, # 497 * 369
+ # factor=4,
+# movie_render_kwargs=dict(
+# shift_x=0.0, # positive right
+# shift_y=-0.3, # negative down
+# shift_z=0,
+# scale_r=0.2,
+# pitch_deg=-40, # negative look downward
+# ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/figurines.py b/configs/lerf/seg_lerf/figurines.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b76d6cdfeac1116c1b1bdff89d63d431fd33941
--- /dev/null
+++ b/configs/lerf/seg_lerf/figurines.py
@@ -0,0 +1,15 @@
+_base_ = './lerf_default.py'
+
+expname = 'dcvgo_figurines'
+
+data = dict(
+ datadir='./data/lerf_data/figurines',
+ factor=2, # 497 * 369
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=55,
+ ),
+)
\ No newline at end of file
diff --git a/configs/lerf/seg_lerf/lerf_default.py b/configs/lerf/seg_lerf/lerf_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba20465281f0f06c12a0f94436b51055b2d81ecf
--- /dev/null
+++ b/configs/lerf/seg_lerf/lerf_default.py
@@ -0,0 +1,52 @@
+# copy from nerf unbounded
+_base_ = '../../seg_default.py'
+
+basedir = './logs/lerf'
+
+data = dict(
+ dataset_type='lerf',
+ spherify=False,
+ factor=2,
+ white_bkgd=True,
+ rand_bkgd=True,
+ inverse_y=False, # llff format
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/lerf/seg_lerf/room.py b/configs/lerf/seg_lerf/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/lerf/seg_lerf/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/llff/airplants.py b/configs/llff/airplants.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a2c01555963c058cb2905cc0929b534df53116
--- /dev/null
+++ b/configs/llff/airplants.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'airplants'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/airplants',
+)
diff --git a/configs/llff/apeskeleton.py b/configs/llff/apeskeleton.py
new file mode 100644
index 0000000000000000000000000000000000000000..d94364261872b6aaaff27b4549fa4a4279d3fc76
--- /dev/null
+++ b/configs/llff/apeskeleton.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'apeskeleton'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/apeskeleton',
+)
diff --git a/configs/llff/bikes.py b/configs/llff/bikes.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a1c6f0ede20193edf593a76db61e953b56efc79
--- /dev/null
+++ b/configs/llff/bikes.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'bikes'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/bikes',
+)
diff --git a/configs/llff/butcher.py b/configs/llff/butcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7f3fda298aa01133399ff37ef304be21e60d208
--- /dev/null
+++ b/configs/llff/butcher.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'butcher'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_butcher',
+)
diff --git a/configs/llff/chesstable.py b/configs/llff/chesstable.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bb76b05e61d8078458809a27791580d58b2b854
--- /dev/null
+++ b/configs/llff/chesstable.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'chesstable'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data2_chesstable',
+)
diff --git a/configs/llff/colorfountain.py b/configs/llff/colorfountain.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e936d2ebec9a4f9372ccf22e47abffeec2c44a9
--- /dev/null
+++ b/configs/llff/colorfountain.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'colorfountain'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/colorfountain',
+)
diff --git a/configs/llff/fern.py b/configs/llff/fern.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9b8ed01b5aa6676420b9867bb4cf361d296fdc7
--- /dev/null
+++ b/configs/llff/fern.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'fern'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fern',
+)
diff --git a/configs/llff/flower.py b/configs/llff/flower.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0578b735b125abc5630b66cc29aaa59118a9c0
--- /dev/null
+++ b/configs/llff/flower.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'flower'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/flower',
+)
diff --git a/configs/llff/fortress.py b/configs/llff/fortress.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd675d52697b25a46832cd0c22cf65b281b767a1
--- /dev/null
+++ b/configs/llff/fortress.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'fortress'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/fortress',
+)
+
diff --git a/configs/llff/horns.py b/configs/llff/horns.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12a4fe46bffacceb6f28012eec4b795447d3aa4
--- /dev/null
+++ b/configs/llff/horns.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'horns'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/horns',
+)
diff --git a/configs/llff/kitchen.py b/configs/llff/kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9a715b269591213c7d61accf10a5ab2a830e581
--- /dev/null
+++ b/configs/llff/kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './llff_default.py'
+
+expname = '360_dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/llff/lab_desk.py b/configs/llff/lab_desk.py
new file mode 100644
index 0000000000000000000000000000000000000000..7efc25edd5f68dc4347193d81d4f92ce78f110e6
--- /dev/null
+++ b/configs/llff/lab_desk.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'lab_desk'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/lab_desk',
+)
diff --git a/configs/llff/leaves.py b/configs/llff/leaves.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d7cb3ee23158068fdd9fe1f2cb4267ab772d67
--- /dev/null
+++ b/configs/llff/leaves.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'leaves'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/leaves',
+)
+
diff --git a/configs/llff/llff_default.py b/configs/llff/llff_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0fd785a56891a9db1a2a20ada71c147b88c7350
--- /dev/null
+++ b/configs/llff/llff_default.py
@@ -0,0 +1,60 @@
+_base_ = '../default.py'
+
+basedir = './logs/llff'
+
+data = dict(
+ dataset_type='llff',
+ ndc=True,
+# width=1008,
+# height=756,
+ factor=4,
+)
+
+coarse_train = dict(
+ N_iters=0,
+)
+
+coarse_model_and_render = dict(
+ num_voxels=320**3,
+ f_num_voxels=320**3,
+ num_voxels_base=320**3,
+ f_num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+ f_k0_type='TensoRFGrid',
+ f_k0_config=dict(n_comp=64),
+)
+
+fine_train = dict(
+ N_iters=30000,
+ #N_iters=60000,
+ N_rand=4096 * 1,
+ #weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000],
+ ray_sampler='flatten',
+ tv_before=1e9,
+ tv_dense_before=10000,
+ weight_tv_density=1e-5,
+ weight_tv_k0=1e-6,
+)
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ f_num_voxels=320**3,
+ num_voxels_base=320**3,
+ f_num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+ f_k0_type='TensoRFGrid',
+ f_k0_config=dict(n_comp=64),
+
+ mpi_depth=128,
+ rgbnet_dim=9,
+ rgbnet_width=64,
+ world_bound_scale=1,
+ fast_color_thres=1e-3,
+)
diff --git a/configs/llff/orchids.py b/configs/llff/orchids.py
new file mode 100644
index 0000000000000000000000000000000000000000..385f52bfa2783ddc2e815f53be4236fe61874c0f
--- /dev/null
+++ b/configs/llff/orchids.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'orchids'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/orchids',
+)
+
diff --git a/configs/llff/plants.py b/configs/llff/plants.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8646158bcbf5316cfd58b2391f249c389758f7
--- /dev/null
+++ b/configs/llff/plants.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'plants'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/plants',
+)
+
diff --git a/configs/llff/playground.py b/configs/llff/playground.py
new file mode 100644
index 0000000000000000000000000000000000000000..e909469a4330b61d10921c43eddc39defebf47e6
--- /dev/null
+++ b/configs/llff/playground.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'playground'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/playground',
+)
diff --git a/configs/llff/pond.py b/configs/llff/pond.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5f9099edb0b2646de1a86121bf886d6c4eefbb
--- /dev/null
+++ b/configs/llff/pond.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'pond'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/pond',
+)
diff --git a/configs/llff/room.py b/configs/llff/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..8310ac73bf90f63808c755914a2cab702e6e40b6
--- /dev/null
+++ b/configs/llff/room.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'room'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/room',
+)
diff --git a/configs/llff/santarex.py b/configs/llff/santarex.py
new file mode 100644
index 0000000000000000000000000000000000000000..db548034a5c62b7b778c31bcc2000e644f772d2b
--- /dev/null
+++ b/configs/llff/santarex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'santarex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data_santarex',
+)
diff --git a/configs/llff/seg/llff_seg_default.py b/configs/llff/seg/llff_seg_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..65a089fee9a984f52d53cda46e63ccc0077f6d98
--- /dev/null
+++ b/configs/llff/seg/llff_seg_default.py
@@ -0,0 +1,52 @@
+_base_ = '../../seg_default.py'
+
+basedir = './logs/llff'
+
+data = dict(
+ dataset_type='llff',
+ ndc=True,
+# width=1008,
+# height=756,
+ factor=4,
+)
+
+coarse_train = dict(
+ N_iters=0,
+)
+
+coarse_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+)
+
+fine_train = dict(
+ N_iters=30000,
+ #N_iters=60000,
+ N_rand=4096 * 1,
+ #weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000],
+ ray_sampler='flatten',
+ tv_before=1e9,
+ tv_dense_before=10000,
+ weight_tv_density=1e-5,
+ weight_tv_k0=1e-6,
+)
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=320**3,
+ density_type='DenseGrid',
+ density_config=dict(n_comp=1),
+ k0_type='TensoRFGrid',
+ k0_config=dict(n_comp=48),
+
+ mpi_depth=128,
+ rgbnet_dim=9,
+ rgbnet_width=64,
+ world_bound_scale=1,
+ fast_color_thres=1e-1,
+)
diff --git a/configs/llff/seg/seg_butcher.py b/configs/llff/seg/seg_butcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..d98eed69df067cf4e0b79b1eb27cda41cd1cb20b
--- /dev/null
+++ b/configs/llff/seg/seg_butcher.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'butcher'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_butcher',
+)
diff --git a/configs/llff/seg/seg_chesstable.py b/configs/llff/seg/seg_chesstable.py
new file mode 100644
index 0000000000000000000000000000000000000000..c845fc88d73dcca44b815af1dd545ed1a794a562
--- /dev/null
+++ b/configs/llff/seg/seg_chesstable.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'chesstable'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data2_chesstable',
+)
diff --git a/configs/llff/seg/seg_fern.py b/configs/llff/seg/seg_fern.py
new file mode 100644
index 0000000000000000000000000000000000000000..658bb21756b31070ccbe586397756b68cbfc98b5
--- /dev/null
+++ b/configs/llff/seg/seg_fern.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'fern'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fern',
+)
diff --git a/configs/llff/seg/seg_flower.py b/configs/llff/seg/seg_flower.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5b6c368419da4ff99481407406c2523ed0e99c
--- /dev/null
+++ b/configs/llff/seg/seg_flower.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'flower'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/flower',
+)
diff --git a/configs/llff/seg/seg_fortress.py b/configs/llff/seg/seg_fortress.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d996616ff3e309b4a2521730aad7d6253ca5f5
--- /dev/null
+++ b/configs/llff/seg/seg_fortress.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'fortress'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/fortress',
+)
+
diff --git a/configs/llff/seg/seg_horns.py b/configs/llff/seg/seg_horns.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7ff38bc7b8700dac4fec4ed9dfd66dbb1870d4
--- /dev/null
+++ b/configs/llff/seg/seg_horns.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'horns'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/horns',
+)
diff --git a/configs/llff/seg/seg_kitchen.py b/configs/llff/seg/seg_kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..cac02e2539d933ce0c6e787b77f4650dde082dad
--- /dev/null
+++ b/configs/llff/seg/seg_kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './llff_seg_default.py'
+
+expname = '360_dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/llff/seg/seg_leaves.py b/configs/llff/seg/seg_leaves.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb4362626bb5f0d1d9db80a185dcb7eef0f2899
--- /dev/null
+++ b/configs/llff/seg/seg_leaves.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'leaves'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/leaves',
+)
diff --git a/configs/llff/seg/seg_orchids.py b/configs/llff/seg/seg_orchids.py
new file mode 100644
index 0000000000000000000000000000000000000000..7288f1151f20f5c292c331fa340815a05276e622
--- /dev/null
+++ b/configs/llff/seg/seg_orchids.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'orchids'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/orchids',
+)
+
diff --git a/configs/llff/seg/seg_pond.py b/configs/llff/seg/seg_pond.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c6e69f858bc8c47da4c189f84b0546da2cb7b3d
--- /dev/null
+++ b/configs/llff/seg/seg_pond.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'pond'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/pond',
+)
diff --git a/configs/llff/seg/seg_room.py b/configs/llff/seg/seg_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ef1c4b99b03f0a8b4bd52fe99895cf571e338e0
--- /dev/null
+++ b/configs/llff/seg/seg_room.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'room'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/room',
+)
diff --git a/configs/llff/seg/seg_santarex.py b/configs/llff/seg/seg_santarex.py
new file mode 100644
index 0000000000000000000000000000000000000000..422abff46e9494208c8e64deeab8ceaf07a229ad
--- /dev/null
+++ b/configs/llff/seg/seg_santarex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'santarex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data_santarex',
+)
diff --git a/configs/llff/seg/seg_shoerack.py b/configs/llff/seg/seg_shoerack.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd93c1ff08f3290f6a5501b6ae9288353672338
--- /dev/null
+++ b/configs/llff/seg/seg_shoerack.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'shoerack'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_shoerack',
+)
+
diff --git a/configs/llff/seg/seg_statue.py b/configs/llff/seg/seg_statue.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e0286c7ea48d10a6a045362e21d7543f9a2b2d7
--- /dev/null
+++ b/configs/llff/seg/seg_statue.py
@@ -0,0 +1,8 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'statue'
+
+data = dict(
+ datadir='./data/statue',
+ factor=1,
+)
diff --git a/configs/llff/seg/seg_stove.py b/configs/llff/seg/seg_stove.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e06241a0b76e371ca34187abdc25ed97f43b51
--- /dev/null
+++ b/configs/llff/seg/seg_stove.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'stove'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_stove',
+)
diff --git a/configs/llff/seg/seg_trex.py b/configs/llff/seg/seg_trex.py
new file mode 100644
index 0000000000000000000000000000000000000000..528721d4051fadcd769b725588675713ef799ab1
--- /dev/null
+++ b/configs/llff/seg/seg_trex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_seg_default.py'
+
+expname = 'trex'
+
+data = dict(
+ datadir='./data/nerf_data/nerf_llff_data(NVOS)/trex',
+)
diff --git a/configs/llff/shelves.py b/configs/llff/shelves.py
new file mode 100644
index 0000000000000000000000000000000000000000..002f35d6ef6df9cecb9149946640034ed0a20ae6
--- /dev/null
+++ b/configs/llff/shelves.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'shelves'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/shelves',
+)
diff --git a/configs/llff/shoerack.py b/configs/llff/shoerack.py
new file mode 100644
index 0000000000000000000000000000000000000000..08fae68c3abb3c176c46627b2011e0785c78fa88
--- /dev/null
+++ b/configs/llff/shoerack.py
@@ -0,0 +1,8 @@
+_base_ = './llff_default.py'
+
+expname = 'shoerack'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_shoerack',
+)
+
diff --git a/configs/llff/statue.py b/configs/llff/statue.py
new file mode 100644
index 0000000000000000000000000000000000000000..825b8024563843b5998071b79ef22842c1b93a5a
--- /dev/null
+++ b/configs/llff/statue.py
@@ -0,0 +1,14 @@
+_base_ = './llff_default.py'
+
+expname = 'statue'
+
+data = dict(
+ datadir='data/statue',
+ factor=1,
+ # ndc=True,
+ # spherify=False,
+ # white_bkgd=True,
+ # rand_bkgd=False,
+ # unbounded_inward=False,
+ # load2gpu_on_the_fly=False,
+)
diff --git a/configs/llff/stove.py b/configs/llff/stove.py
new file mode 100644
index 0000000000000000000000000000000000000000..026aa7a4abff8a325d8e9bfdf5207f186990ed14
--- /dev/null
+++ b/configs/llff/stove.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'stove'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/real_iconic/data4_stove',
+)
diff --git a/configs/llff/succtrough.py b/configs/llff/succtrough.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd699b49221d7750186d038a4cb37d5fd51e8d3
--- /dev/null
+++ b/configs/llff/succtrough.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'succtrough'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/succtrough',
+)
diff --git a/configs/llff/trex.py b/configs/llff/trex.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b299e74fa14d3bcec0bdf68e4c47d0647e7e774
--- /dev/null
+++ b/configs/llff/trex.py
@@ -0,0 +1,7 @@
+_base_ = './llff_default.py'
+
+expname = 'trex'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/trex',
+)
diff --git a/configs/nerf_unbounded/bicycle.py b/configs/nerf_unbounded/bicycle.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95e646989e0f48b448327dc0efdc2b1a9b9c0e5
--- /dev/null
+++ b/configs/nerf_unbounded/bicycle.py
@@ -0,0 +1,14 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_bicycle_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bicycle',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
+
diff --git a/configs/nerf_unbounded/bonsai.py b/configs/nerf_unbounded/bonsai.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f8d5b0be377547adbf3fd2dfbcb314984864d3
--- /dev/null
+++ b/configs/nerf_unbounded/bonsai.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_bonsai_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bonsai',
+ factor=4, # 1559x1039
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0, # negative down
+ shift_z=0,
+ scale_r=1.0,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/counter.py b/configs/nerf_unbounded/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c7d362485b684b6e4da4bafd40868f5cc89f55e
--- /dev/null
+++ b/configs/nerf_unbounded/counter.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_counter_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/counter',
+ factor=4, # 1558x1038
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.2, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/fish.py b/configs/nerf_unbounded/fish.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fb9ba7968bc1e60f34d0d10ea02ba2a2748798d
--- /dev/null
+++ b/configs/nerf_unbounded/fish.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_fish_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/fish',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/fork.py b/configs/nerf_unbounded/fork.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80c67079744bcf80e0828b005cc3bffb56f5308
--- /dev/null
+++ b/configs/nerf_unbounded/fork.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_fork_unbounded'
+
+data = dict(
+ datadir='./data/fork/dense',
+ factor=8, # 1558x1038
+ bd_factor=None,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/garden.py b/configs/nerf_unbounded/garden.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecbc1dbbb10c0c8ba50c7f68d24bf66e0e64a4ac
--- /dev/null
+++ b/configs/nerf_unbounded/garden.py
@@ -0,0 +1,15 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_garden_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/garden',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30,
+ ),
+)
diff --git a/configs/nerf_unbounded/kitchen.py b/configs/nerf_unbounded/kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..aec8a4bcb666a39c0afbea633958023a8648a167
--- /dev/null
+++ b/configs/nerf_unbounded/kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./nerf/data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/lab_desk.py b/configs/nerf_unbounded/lab_desk.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fba48d4e7e1a18c9f645959886a4dc34ccf55ab
--- /dev/null
+++ b/configs/nerf_unbounded/lab_desk.py
@@ -0,0 +1,8 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'lab_desk'
+
+data = dict(
+ datadir='./data/nerf_llff_data(NVOS)/lab_desk',
+ factor=2,
+)
diff --git a/configs/nerf_unbounded/legohouse.py b/configs/nerf_unbounded/legohouse.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c49c26a20676c26703c9af54d15b1357819305e
--- /dev/null
+++ b/configs/nerf_unbounded/legohouse.py
@@ -0,0 +1,13 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_legohouse_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/legohouse',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/mat.py b/configs/nerf_unbounded/mat.py
new file mode 100644
index 0000000000000000000000000000000000000000..f912bcd496657673b3d0d692dfba69bff3058bb6
--- /dev/null
+++ b/configs/nerf_unbounded/mat.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_mat_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/mat',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=1.0,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/nerf_unbounded_default.py b/configs/nerf_unbounded/nerf_unbounded_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb18b7e885aff50f7f82b9a6817f775529e26d42
--- /dev/null
+++ b/configs/nerf_unbounded/nerf_unbounded_default.py
@@ -0,0 +1,51 @@
+_base_ = '../default.py'
+
+basedir = './logs/nerf_unbounded'
+
+data = dict(
+ dataset_type='llff',
+ spherify=True,
+ factor=4,
+ llffhold=8,
+ white_bkgd=True,
+ rand_bkgd=True,
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7,
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres={
+ '_delete_': True,
+ 0 : alpha_init*stepsize/10,
+ 1500: min(alpha_init, 1e-4)*stepsize/5,
+ 2500: min(alpha_init, 1e-4)*stepsize/2,
+ 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+ 4500: min(alpha_init, 1e-4)*stepsize,
+ 5500: min(alpha_init, 1e-4),
+ 6500: 1e-4,
+ },
+ world_bound_scale=1,
+)
diff --git a/configs/nerf_unbounded/pinecone.py b/configs/nerf_unbounded/pinecone.py
new file mode 100644
index 0000000000000000000000000000000000000000..4daa91a07a6aeb375efd106e0f108395a1ff6831
--- /dev/null
+++ b/configs/nerf_unbounded/pinecone.py
@@ -0,0 +1,15 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_pinecone_unbounded'
+
+data = dict(
+ datadir='./data/nerf_real_360/pinecone',
+ factor=8, # 484x363
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
\ No newline at end of file
diff --git a/configs/nerf_unbounded/redtable.py b/configs/nerf_unbounded/redtable.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd3e20e100f39b6ba50c7a6b0d9c6a015f2f6b0
--- /dev/null
+++ b/configs/nerf_unbounded/redtable.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_redtable_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/redtable',
+ factor=2, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/room.py b/configs/nerf_unbounded/room.py
new file mode 100644
index 0000000000000000000000000000000000000000..602e5a86434a1c7cab42d827cd364af84ae331a8
--- /dev/null
+++ b/configs/nerf_unbounded/room.py
@@ -0,0 +1,17 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_room_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/room',
+ # factor=2, # 1557x1038
+ factor=4,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.3, # negative down
+ shift_z=0,
+ scale_r=0.2,
+ pitch_deg=-40, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/sculptures.py b/configs/nerf_unbounded/sculptures.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f9398989e1039ee38f9253daf99b85803603e7
--- /dev/null
+++ b/configs/nerf_unbounded/sculptures.py
@@ -0,0 +1,16 @@
+_base_ = './nerf_unbounded_default.py'
+
+expname = 'dcvgo_sculptures_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/sculptures',
+ factor=4, # 1297x840
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ #shift_y=-0.0, # negative down
+ shift_y=-0.10, # negative down
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-0,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_bicycle.py b/configs/nerf_unbounded/seg_bicycle.py
new file mode 100644
index 0000000000000000000000000000000000000000..54ffa8ca8680b0a5a4b1169c6a1fc33575b00036
--- /dev/null
+++ b/configs/nerf_unbounded/seg_bicycle.py
@@ -0,0 +1,14 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_bicycle_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/bicycle',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_bonsai.py b/configs/nerf_unbounded/seg_bonsai.py
new file mode 100644
index 0000000000000000000000000000000000000000..d186467b2e8012653c314225dd7f1bee15d19778
--- /dev/null
+++ b/configs/nerf_unbounded/seg_bonsai.py
@@ -0,0 +1,16 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_bonsai_unbounded'
+
+data = dict(
+ datadir='./data/nerf/360_v2/bonsai',
+ factor=4, # 1559x1039
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0, # negative down
+ shift_z=0,
+ scale_r=1.0,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_counter.py b/configs/nerf_unbounded/seg_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..c18f4c63884ef6f87ef24f1dd0008ad43f096391
--- /dev/null
+++ b/configs/nerf_unbounded/seg_counter.py
@@ -0,0 +1,16 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_counter_unbounded'
+
+data = dict(
+ datadir='./data/nerf/360_v2/counter',
+ factor=8, # 1558x1038
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.2, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_fork.py b/configs/nerf_unbounded/seg_fork.py
new file mode 100644
index 0000000000000000000000000000000000000000..5318166be98d21ed1b0b6f2c791505c185345e4e
--- /dev/null
+++ b/configs/nerf_unbounded/seg_fork.py
@@ -0,0 +1,17 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dcvgo_fork_unbounded'
+
+data = dict(
+ datadir='./data/nerf/fork/',
+ factor=8, # 1558x1038
+ bd_factor=None,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30, # negative look downward
+ ),
+)
+
diff --git a/configs/nerf_unbounded/seg_garden.py b/configs/nerf_unbounded/seg_garden.py
new file mode 100644
index 0000000000000000000000000000000000000000..11909ce245e4140afe24f174e7fdc3bf5d591940
--- /dev/null
+++ b/configs/nerf_unbounded/seg_garden.py
@@ -0,0 +1,15 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_garden_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/garden',
+ factor=8,
+ movie_render_kwargs=dict(
+ shift_x=0.0, # positive right
+ shift_y=-0.0, # negative down
+ shift_z=0,
+ scale_r=0.9,
+ pitch_deg=-30,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_kitchen.py b/configs/nerf_unbounded/seg_kitchen.py
new file mode 100644
index 0000000000000000000000000000000000000000..90caab07762467ab5488121539b9afd78ae5d7ef
--- /dev/null
+++ b/configs/nerf_unbounded/seg_kitchen.py
@@ -0,0 +1,13 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dvgo_kitchen_unbounded'
+
+data = dict(
+ datadir='./data/360_v2/kitchen',
+ factor=4, # 1558x1039
+ movie_render_kwargs=dict(
+ shift_y=-0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/nerf_unbounded/seg_nerf_unbounded_default.py b/configs/nerf_unbounded/seg_nerf_unbounded_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..90ac5106edf1433df199ae1a3baf669a21fe67fb
--- /dev/null
+++ b/configs/nerf_unbounded/seg_nerf_unbounded_default.py
@@ -0,0 +1,52 @@
+_base_ = '../seg_default.py'
+
+basedir = './logs/nerf_unbounded'
+
+data = dict(
+ dataset_type='llff',
+ spherify=True,
+ factor=4,
+ llffhold=8,
+ white_bkgd=True,
+ rand_bkgd=True,
+ unbounded_inward=True,
+ load2gpu_on_the_fly=True,
+)
+
+coarse_train = dict(N_iters=0)
+
+fine_train = dict(
+ N_iters=800000,
+ N_rand=1024 * 4,
+ lrate_decay=80,
+ ray_sampler='flatten',
+ weight_nearclip=1.0,
+ weight_distortion=0.01,
+ pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],
+ tv_before=20000,
+ tv_dense_before=20000,
+ weight_tv_density=1e-6,
+ weight_tv_k0=1e-7,
+)
+
+alpha_init = 1e-4
+stepsize = 0.5
+
+fine_model_and_render = dict(
+ num_voxels=320**3,
+ num_voxels_base=160**3,
+ alpha_init=alpha_init,
+ stepsize=stepsize,
+ fast_color_thres=0.1,
+# fast_color_thres={
+# '_delete_': True,
+# 0 : alpha_init*stepsize/10,
+# 1500: min(alpha_init, 1e-4)*stepsize/5,
+# 2500: min(alpha_init, 1e-4)*stepsize/2,
+# 3500: min(alpha_init, 1e-4)*stepsize/1.5,
+# 4500: min(alpha_init, 1e-4)*stepsize,
+# 5500: min(alpha_init, 1e-4),
+# 6500: 1e-4,
+# },
+ world_bound_scale=1,
+)
diff --git a/configs/nerf_unbounded/seg_pinecone.py b/configs/nerf_unbounded/seg_pinecone.py
new file mode 100644
index 0000000000000000000000000000000000000000..e36a3617272f6fc48ce78cee2929c0ae6bf00696
--- /dev/null
+++ b/configs/nerf_unbounded/seg_pinecone.py
@@ -0,0 +1,15 @@
+_base_ = './seg_nerf_unbounded_default.py'
+
+expname = 'dcvgo_pinecone_unbounded'
+
+data = dict(
+ datadir='./data/nerf/nerf_real_360/pinecone',
+ factor=8, # 484x363
+ movie_render_kwargs=dict(
+ shift_x=0.0,
+ shift_y=0.0,
+ shift_z=0.0,
+ scale_r=0.9,
+ pitch_deg=-40,
+ ),
+)
diff --git a/configs/seg_default.py b/configs/seg_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..a85034dc67cb6e6121df7bd04a1b96994968b5d3
--- /dev/null
+++ b/configs/seg_default.py
@@ -0,0 +1,122 @@
+from copy import deepcopy
+
+expname = None # experiment name
+basedir = './logs/' # where to store ckpts and logs
+
+''' Template of data options
+'''
+data = dict(
+ datadir=None, # path to dataset root folder
+ dataset_type=None, # blender | nsvf | blendedmvs | tankstemple | deepvoxels | co3d
+ inverse_y=False, # intrinsic mode (to support blendedmvs, nsvf, tankstemple)
+ flip_x=False, # to support co3d
+ flip_y=False, # to support co3d
+ annot_path='', # to support co3d
+ split_path='', # to support co3d
+ sequence_name='', # to support co3d
+# load2gpu_on_the_fly=False, # do not load all images into gpu (to save gpu memory)
+ load2gpu_on_the_fly=True, # do not load all images into gpu (to save gpu memory)
+ testskip=5, # subsample testset to preview results
+ white_bkgd=True, # use white background (note that some datasets don't provide alpha and come with a blended bg color)
+ rand_bkgd=False, # use random background during training
+ half_res=False, # [TODO]
+ bd_factor=.75,
+ movie_render_kwargs=dict(),
+
+ # Below are forward-facing llff specific settings.
+ ndc=False, # use ndc coordinate (only for forward-facing; not support yet)
+ spherify=False, # inward-facing
+ factor=4, # [TODO]
+ width=None, # enforce image width
+ height=None, # enforce image height
+ llffhold=8, # testsplit
+ load_depths=False, # load depth
+
+ # Below are unbounded inward-facing specific settings.
+ unbounded_inward=False,
+ unbounded_inner_r=1.0,
+)
+
+''' Template of training options
+'''
+coarse_train = dict(
+ N_iters=5000, # number of optimization steps
+ N_rand=8192, # batch size (number of random rays per optimization step)
+ #N_rand=1024, # batch size (number of random rays per optimization step)
+ lrate_seg_mask_grid=1, # lr of segmentation voxel grid
+ lrate_dual_seg_mask_grid=1, # lr of dual segmentation voxel grid
+ # lrate_k0_mask_grid=1e-2,
+ lrate_density=0, # lr of density voxel grid
+ lrate_k0=0, # lr of color/feature voxel grid
+ lrate_rgbnet=0, # lr of the mlp to predict view-dependent color
+ lrate_decay=20, # lr decay by 0.1 after every lrate_decay*1000 steps
+ pervoxel_lr=False, # view-count-based lr
+ pervoxel_lr_downrate=0, # downsampled image for computing view-count-based lr
+ ray_sampler='random', # ray sampling strategies
+ weight_main=1.0, # weight of photometric loss
+ weight_entropy_last=0.01, # weight of background entropy loss
+ weight_nearclip=0,
+ weight_distortion=0,
+ weight_rgbper=0.1, # weight of per-point rgb loss
+ tv_every=1, # count total variation loss every tv_every step
+ tv_after=0, # count total variation loss from tv_from step
+ tv_before=0, # count total variation before the given number of iterations
+ tv_dense_before=0, # count total variation densely before the given number of iterations
+ weight_tv_density=0.0, # weight of total variation loss of density voxel grid
+ weight_tv_k0=0.0, # weight of total variation loss of color/feature voxel grid
+ pg_scale=[], # checkpoints for progressive scaling
+ decay_after_scale=1.0, # decay act_shift after scaling
+ skip_zero_grad_fields=[], # the variable name to skip optimizing parameters w/ zero grad in each iteration
+ maskout_lt_nviews=0,
+)
+
+fine_train = deepcopy(coarse_train)
+fine_train.update(dict(
+ N_iters=20000,
+ pervoxel_lr=False,
+ ray_sampler='flatten',
+ weight_entropy_last=0.001,
+ weight_rgbper=0.01,
+ pg_scale=[1000, 2000, 3000, 4000],
+ skip_zero_grad_fields=['density', 'k0'],
+))
+
+''' Template of model and rendering options
+'''
+coarse_model_and_render = dict(
+ num_voxels=1024000, # expected number of voxel
+ num_voxels_base=1024000, # to rescale delta distance
+ density_type='DenseGrid', # DenseGrid, TensoRFGrid
+ k0_type='TensoRFGrid', # DenseGrid, TensoRFGrid
+ density_config=dict(),
+ k0_config=dict(n_comp=48),
+ mpi_depth=128, # the number of planes in Multiplane Image (work when ndc=True)
+ nearest=False, # nearest interpolation
+ pre_act_density=False, # pre-activated trilinear interpolation
+ in_act_density=False, # in-activated trilinear interpolation
+ bbox_thres=1e-3, # threshold to determine known free-space in the fine stage
+ mask_cache_thres=1e-3, # threshold to determine a tighten BBox in the fine stage
+ rgbnet_dim=0, # feature voxel grid dim
+ rgbnet_full_implicit=False, # let the colors MLP ignore feature voxel grid
+ rgbnet_direct=True, # set to False to treat the first 3 dim of feature voxel grid as diffuse rgb
+ rgbnet_depth=3, # depth of the colors MLP (there are rgbnet_depth-1 intermediate features)
+ rgbnet_width=128, # width of the colors MLP
+ alpha_init=1e-6, # set the alpha values everywhere at the begin of training
+ fast_color_thres=1e-7, # threshold of alpha value to skip the fine stage sampled point
+ maskout_near_cam_vox=True, # maskout grid points that between cameras and their near planes
+ world_bound_scale=1, # rescale the BBox enclosing the scene
+ stepsize=0.5, # sampling stepsize in volume rendering
+)
+
+fine_model_and_render = deepcopy(coarse_model_and_render)
+fine_model_and_render.update(dict(
+ num_voxels=160**3,
+ num_voxels_base=160**3,
+ rgbnet_dim=12,
+ alpha_init=1e-2,
+ fast_color_thres=1e-4,
+ maskout_near_cam_vox=False,
+ world_bound_scale=1.05,
+))
+
+del deepcopy
diff --git a/croco/LICENSE b/croco/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d
--- /dev/null
+++ b/croco/LICENSE
@@ -0,0 +1,52 @@
+CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
+
+
+SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+This software in this file incorporates parts of the following software available here:
+
+Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py
+available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE
+
+MoCo v3: https://github.com/facebookresearch/moco-v3
+available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE
+
+DeiT: https://github.com/facebookresearch/deit
+available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE
+
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:
+
+https://github.com/facebookresearch/mae/blob/main/LICENSE
+
+Attribution-NonCommercial 4.0 International
+
+***************************
+
+NOTICE WITH RESPECT TO THE FILE: models/blocks.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/rwightman/pytorch-image-models
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:
+
+https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/croco/NOTICE b/croco/NOTICE
new file mode 100644
index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c
--- /dev/null
+++ b/croco/NOTICE
@@ -0,0 +1,21 @@
+CroCo
+Copyright 2022-present NAVER Corp.
+
+This project contains subcomponents with separate copyright notices and license terms.
+Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
+
+====
+
+facebookresearch/mae
+https://github.com/facebookresearch/mae
+
+Attribution-NonCommercial 4.0 International
+
+====
+
+rwightman/pytorch-image-models
+https://github.com/rwightman/pytorch-image-models
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
\ No newline at end of file
diff --git a/croco/README.MD b/croco/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b
--- /dev/null
+++ b/croco/README.MD
@@ -0,0 +1,124 @@
+# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow
+
+[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)]
+
+This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), referred to as CroCo v2:
+
+![image](assets/arch.jpg)
+
+```bibtex
+@inproceedings{croco,
+ title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}},
+ author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}},
+ booktitle={{NeurIPS}},
+ year={2022}
+}
+
+@inproceedings{croco_v2,
+ title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}},
+ author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me},
+ booktitle={ICCV},
+ year={2023}
+}
+```
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information.
+Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License.
+Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license.
+
+## Preparation
+
+1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version.
+
+```bash
+conda create -n croco python=3.7 cmake=3.14.0
+conda activate croco
+conda install habitat-sim headless -c conda-forge -c aihabitat
+conda install pytorch torchvision -c pytorch
+conda install notebook ipykernel matplotlib
+conda install ipywidgets widgetsnbextension
+conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation
+
+```
+
+2. Compile cuda kernels for RoPE
+
+CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels.
+```bash
+cd models/curope/
+python setup.py build_ext --inplace
+cd ../../
+```
+
+This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only.
+You might also need to set the environment `CUDA_HOME` in case you use a custom cuda installation.
+
+In case you cannot compile the cuda kernels, we also provide a slow pytorch version, which will be loaded automatically.
+
+3. Download pre-trained model
+
+We provide several pre-trained models:
+
+| modelname | pre-training data | pos. embed. | Encoder | Decoder |
+|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------|
+| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small |
+| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small |
+| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base |
+| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base |
+
+To download a specific model, i.e., the first one (`CroCo.pth`)
+```bash
+mkdir -p pretrained_models/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/
+```
+
+## Reconstruction example
+
+Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`)
+```bash
+python demo.py
+```
+
+## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator
+
+First download the test scene from Habitat:
+```bash
+python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/
+```
+
+Then, run the Notebook demo `interactive_demo.ipynb`.
+
+In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo.
+![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg)
+
+## Pre-training
+
+### CroCo
+
+To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command:
+```
+torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/
+```
+
+Our CroCo pre-training was launched on a single server with 4 GPUs.
+It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experiment to verify that it is valid in our case.
+The first run can take a few minutes to start, to parse all available pre-training pairs.
+
+### CroCo v2
+
+For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD).
+Then, run the following command for the largest model (ViT-L encoder, Base decoder):
+```
+torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/
+```
+
+Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases.
+The largest model should take around 12 days on A100.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experiment to verify that it is valid in our case.
+
+## Stereo matching and Optical flow downstream tasks
+
+For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD).
diff --git a/croco/assets/Chateau1.png b/croco/assets/Chateau1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d282fc6a51c00b8dd8267d5d507220ae253c2d65
Binary files /dev/null and b/croco/assets/Chateau1.png differ
diff --git a/croco/assets/Chateau2.png b/croco/assets/Chateau2.png
new file mode 100644
index 0000000000000000000000000000000000000000..722b2fc553ec089346722efb9445526ddfa8e7bd
Binary files /dev/null and b/croco/assets/Chateau2.png differ
diff --git a/croco/assets/arch.jpg b/croco/assets/arch.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3f5b032729ddc58c06d890a0ebda1749276070c4
Binary files /dev/null and b/croco/assets/arch.jpg differ
diff --git a/croco/croco-stereo-flow-demo.ipynb b/croco/croco-stereo-flow-demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0
--- /dev/null
+++ b/croco/croco-stereo-flow-demo.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9bca0f41",
+ "metadata": {},
+ "source": [
+ "# Simple inference example with CroCo-Stereo or CroCo-Flow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80653ef7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f033862",
+ "metadata": {},
+ "source": [
+ "First download the model(s) of your choice by running\n",
+ "```\n",
+ "bash stereoflow/download_model.sh crocostereo.pth\n",
+ "bash stereoflow/download_model.sh crocoflow.pth\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fb2e392",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "import matplotlib.pylab as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0e25d77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from stereoflow.test import _load_model_and_criterion\n",
+ "from stereoflow.engine import tiled_pred\n",
+ "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n",
+ "from stereoflow.datasets_flow import flowToColor\n",
+ "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86a921f5",
+ "metadata": {},
+ "source": [
+ "### CroCo-Stereo example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64e483cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0d04303",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47dc14b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).squeeze(0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "583b9f16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(vis_disparity(pred))\n",
+ "plt.axis('off')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2df5d70",
+ "metadata": {},
+ "source": [
+ "### CroCo-Flow example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ee257a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image1 = np.asarray(Image.open(''))\n",
+ "image2 = np.asarray(Image.open(''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5edccf0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b19692c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+ "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+ "with torch.inference_mode():\n",
+ " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+ "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26f79db3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(flowToColor(pred))\n",
+ "plt.axis('off')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/croco/datasets/__init__.py b/croco/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/croco/datasets/crops/README.MD b/croco/datasets/crops/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f
--- /dev/null
+++ b/croco/datasets/crops/README.MD
@@ -0,0 +1,104 @@
+## Generation of crops from the real datasets
+
+The instructions below allow you to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL.
+
+### Download the metadata of the crops to generate
+
+First, download the metadata and put them in `./data/`:
+```
+mkdir -p data
+cd data/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip
+unzip crop_metadata.zip
+rm crop_metadata.zip
+cd ..
+```
+
+### Prepare the original datasets
+
+Second, download the original datasets in `./data/original_datasets/`.
+```
+mkdir -p data/original_datasets
+```
+
+##### ARKitScenes
+
+Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`.
+The resulting file structure should be like:
+```
+./data/original_datasets/ARKitScenes/
+└───Training
+ └───40753679
+ │ │ ultrawide
+ │ │ ...
+ └───40753686
+ │
+ ...
+```
+
+##### MegaDepth
+
+Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/MegaDepth/
+└───0000
+│ └───images
+│ │ │ 1000557903_87fa96b8a4_o.jpg
+│ │ └ ...
+│ └─── ...
+└───0001
+│ │
+│ └ ...
+└─── ...
+```
+
+##### 3DStreetView
+
+Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/3DStreetView/
+└───dataset_aligned
+│ └───0002
+│ │ │ 0000002_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+└───dataset_unaligned
+│ └───0003
+│ │ │ 0000003_0000001_0000002_0000001.jpg
+│ │ └ ...
+│ └─── ...
+```
+
+##### IndoorVL
+
+Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture).
+
+```
+pip install kapture
+mkdir -p ./data/original_datasets/IndoorVL
+cd ./data/original_datasets/IndoorVL
+kapture_download_dataset.py update
+kapture_download_dataset.py install "HyundaiDepartmentStore_*"
+kapture_download_dataset.py install "GangnamStation_*"
+cd -
+```
+
+### Extract the crops
+
+Now, extract the crops for each of the datasets:
+```
+for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL;
+do
+ python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500;
+done
+```
+
+##### Note for IndoorVL
+
+Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper.
+To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively.
+The impact on the performance is negligible.
diff --git a/croco/datasets/crops/extract_crops_from_images.py b/croco/datasets/crops/extract_crops_from_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb66a0474ce44b54c44c08887cbafdb045b11ff3
--- /dev/null
+++ b/croco/datasets/crops/extract_crops_from_images.py
@@ -0,0 +1,159 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Extracting crops for pre-training
+# --------------------------------------------------------
+
+import os
+import argparse
+from tqdm import tqdm
+from PIL import Image
+import functools
+from multiprocessing import Pool
+import math
+
+
def arg_parser():
    """Build the command-line parser for the crop-extraction script."""
    parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list')

    # Mandatory path arguments.
    for flag, helptext in (('--crops', 'crop file'),
                           ('--root-dir', 'root directory'),
                           ('--output-dir', 'output directory')):
        parser.add_argument(flag, type=str, required=True, help=helptext)
    parser.add_argument('--imsize', type=int, default=256, help='size of the crops')
    parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads')
    parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories')
    parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir')
    return parser
+
+
def main(args):
    """Generate all crops listed in ``args.crops`` and write a listing file.

    Loads the crop metadata, sizes the output directory tree so that roughly
    ``args.ideal_number_pairs_in_dir`` pairs land in each directory (capped at
    ``args.max_subdir_levels`` levels), then extracts and saves the crops —
    optionally with a process pool — while recording every generated pair path
    in ``<output_dir>/listing.txt``.
    """
    listing_path = os.path.join(args.output_dir, 'listing.txt')

    print(f'Loading list of crops ... ({args.nthread} threads)')
    crops, num_crops_to_generate = load_crop_file(args.crops)

    print(f'Preparing jobs ({len(crops)} candidate image pairs)...')
    # Number of directory levels needed so that each directory holds about
    # `ideal_number_pairs_in_dir` entries.
    num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels)
    num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels))

    jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
    del crops  # free the (potentially large) metadata before forking workers

    os.makedirs(args.output_dir, exist_ok=True)
    call = functools.partial(save_image_crops, args)
    # Only spin up a process pool when more than one worker was requested.
    # The pool is explicitly closed and joined below — the original created it
    # inline and leaked it.
    pool = Pool(args.nthread) if args.nthread > 1 else None
    mmap = pool.imap_unordered if pool is not None else map

    print(f"Generating cropped images to {args.output_dir} ...")
    try:
        with open(listing_path, 'w') as listing:
            listing.write('# pair_path\n')
            for results in tqdm(mmap(call, jobs), total=len(jobs)):
                for path in results:
                    listing.write(f'{path}\n')
    finally:
        if pool is not None:
            pool.close()
            pool.join()
    print('Finished writing listing to', listing_path)
+
+
def load_crop_file(path):
    """Parse a crop-metadata file.

    The file interleaves two kinds of ", "-separated lines:
      * pair lines: ``<img1>, <img2>, <rotation>`` (3 fields);
      * crop lines: 8 integers ``l1, r1, t1, b1, l2, r2, t2, b2`` giving one
        crop rectangle per image of the preceding pair line.
    Lines starting with '#' are comments.

    Returns:
        (pairs, num_crops_to_generate): ``pairs`` is a list of
        (img1, img2, rotation, [(rect1, rect2), ...]) tuples, with rectangles
        in PIL (left, top, right, bottom) order.
    """
    # Use a context manager so the file handle is released deterministically
    # (the original used a bare `open(path).read()` and leaked it).
    with open(path) as f:
        data = f.read().splitlines()
    pairs = []
    num_crops_to_generate = 0
    for line in tqdm(data):
        if line.startswith('#'):
            continue
        line = line.split(', ')
        if len(line) < 8:
            # Pair line: start a new entry with an empty crop list.
            img1, img2, rotation = line
            pairs.append((img1, img2, int(rotation), []))
        else:
            # Crop line: attach the rectangles to the most recent pair.
            l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line)
            rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
            pairs[-1][-1].append((rect1, rect2))
            num_crops_to_generate += 1
    return pairs, num_crops_to_generate
+
+
def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
    """Turn the parsed pair list into per-pair jobs with output sub-paths.

    Each job is ((img1, img2), rotation, crops, paths), where ``paths`` holds
    one relative output path (hex components, no extension) per crop.
    """
    jobs = []
    # powers[level] = number of leaf entries spanned by one directory at that level.
    powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]

    def get_path(idx):
        # Map a flat crop index to a nested hex path such as "3/1f/2a4".
        idx_array = []
        d = idx
        for level in range(num_levels - 1):
            idx_array.append(idx // powers[level])
            idx = idx % powers[level]
        # NOTE(review): the leaf component is the *full* original index `d`,
        # not the remainder left in `idx` after the loop — this keeps leaf
        # names globally unique across directories; confirm it is intentional.
        idx_array.append(d)
        return '/'.join(map(lambda x: hex(x)[2:], idx_array))

    idx = 0
    for pair_data in tqdm(pairs):
        img1, img2, rotation, crops = pair_data
        if -60 <= rotation and rotation <= 60:
            rotation = 0 # most likely not a true rotation
        paths = [get_path(idx + k) for k in range(len(crops))]
        idx += len(crops)
        jobs.append(((img1, img2), rotation, crops, paths))
    return jobs
+
+
def load_image(path):
    """Open an image file and convert it to RGB.

    Raises:
        OSError: if the image cannot be read or decoded. The original
            exception is chained as the cause and the offending path is
            included in the message (the original raised a bare ``OSError()``
            that dropped both).
    """
    try:
        return Image.open(path).convert('RGB')
    except Exception as e:
        print('skipping', path, e)
        # Normalize any decoding failure to OSError so callers can catch a
        # single exception type; keep the cause for debuggability.
        raise OSError(f'could not load image: {path}') from e
+
+
def save_image_crops(args, data):
    """Extract and save all crops of one image pair.

    ``data`` is one job from ``prepare_jobs``: ((img1, img2), rotation,
    crop rectangles, output paths). Returns the list of pair paths actually
    written (empty if either source image failed to load).
    """
    # load images
    img_pair, rot, crops, paths = data
    try:
        img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair]
    except OSError as e:
        # Skip the whole pair if either image is unreadable.
        return []

    def area(sz):
        # Pixel area of a PIL (width, height) size tuple.
        return sz[0] * sz[1]

    tgt_size = (args.imsize, args.imsize)

    def prepare_crop(img, rect, rot=0):
        # actual crop
        img = img.crop(rect)

        # resize to desired size; LANCZOS for strong downscaling (source area
        # more than 4x the target), BICUBIC otherwise
        interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC
        img = img.resize(tgt_size, resample=interp)

        # rotate the image by the nearest multiple of 90 degrees
        rot90 = (round(rot/90) % 4) * 90
        if rot90 == 90:
            img = img.transpose(Image.Transpose.ROTATE_90)
        elif rot90 == 180:
            img = img.transpose(Image.Transpose.ROTATE_180)
        elif rot90 == 270:
            img = img.transpose(Image.Transpose.ROTATE_270)
        return img

    results = []
    for (rect1, rect2), path in zip(crops, paths):
        crop1 = prepare_crop(img1, rect1)
        crop2 = prepare_crop(img2, rect2, rot)  # only the second view is rotated

        fullpath1 = os.path.join(args.output_dir, path+'_1.jpg')
        fullpath2 = os.path.join(args.output_dir, path+'_2.jpg')
        os.makedirs(os.path.dirname(fullpath1), exist_ok=True)

        # NOTE(review): overwrite guards — stripped when run with `python -O`
        assert not os.path.isfile(fullpath1), fullpath1
        assert not os.path.isfile(fullpath2), fullpath2
        crop1.save(fullpath1)
        crop2.save(fullpath2)
        results.append(path)

    return results
+
+
if __name__ == '__main__':
    # Parse CLI arguments and run the extraction pipeline.
    main(arg_parser().parse_args())
+
diff --git a/croco/datasets/habitat_sim/README.MD b/croco/datasets/habitat_sim/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940
--- /dev/null
+++ b/croco/datasets/habitat_sim/README.MD
@@ -0,0 +1,76 @@
+## Generation of synthetic image pairs using Habitat-Sim
+
+These instructions allow you to generate pre-training pairs from the Habitat simulator.
+As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent.
+
+### Download Habitat-Sim scenes
+Download Habitat-Sim scenes:
+- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md
+- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets.
+- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`.
+```
+./data/
+└──habitat-sim-data/
+ └──scene_datasets/
+ ├──hm3d/
+ ├──gibson/
+ ├──habitat-test-scenes/
+ ├──replica_cad_baked_lighting/
+ ├──replica_cad/
+ ├──ReplicaDataset/
+ └──scannet/
+```
+
+### Image pairs generation
+We provide metadata to generate reproducible image pairs for pretraining and validation.
+Experiments described in the paper used similar data, but whose generation was not reproducible at the time.
+
+Specifications:
+- 256x256 resolution images, with a 60-degree field of view.
+- Up to 1000 image pairs per scene.
+- Number of scenes considered/number of images pairs per dataset:
+ - Scannet: 1097 scenes / 985 209 pairs
+ - HM3D:
+ - hm3d/train: 800 / 800k pairs
+ - hm3d/val: 100 scenes / 100k pairs
+ - hm3d/minival: 10 scenes / 10k pairs
+ - habitat-test-scenes: 3 scenes / 3k pairs
+ - replica_cad_baked_lighting: 13 scenes / 13k pairs
+
+- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes.
+
+Download metadata and extract it:
+```bash
+mkdir -p data/habitat_release_metadata/
+cd data/habitat_release_metadata/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz
+tar -xvf multiview_habitat_metadata.tar.gz
+cd ../..
+# Location of the metadata
+METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata"
+```
+
+Generate image pairs from metadata:
+- The following command will print a list of commandlines to generate image pairs for each scene:
+```bash
+# Target output directory
+PAIRS_DATASET_DIR="./data/habitat_release/"
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR
+```
+- One can launch multiple of such commands in parallel e.g. using GNU Parallel:
+```bash
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16
+```
+
+## Metadata generation
+
+Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible:
+```bash
+# Print commandlines to generate image pairs from the different scenes available.
+PAIRS_DATASET_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR
+
+# Once a dataset is generated, pack metadata files for reproducibility.
+METADATA_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR
+```
diff --git a/croco/datasets/habitat_sim/__init__.py b/croco/datasets/habitat_sim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/croco/datasets/habitat_sim/generate_from_metadata.py b/croco/datasets/habitat_sim/generate_from_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0d399084359495250dc8184671ff498adfbf2
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_from_metadata.py
@@ -0,0 +1,92 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script to generate image pairs for a given scene reproducing poses provided in a metadata file.
+"""
+import os
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator
+from datasets.habitat_sim.paths import SCENES_DATASET
+import argparse
+import quaternion
+import PIL.Image
+import cv2
+import json
+from tqdm import tqdm
+
def generate_multiview_images_from_metadata(metadata_filename,
                                            output_dir,
                                            overload_params = dict(),
                                            scene_datasets_paths=None,
                                            exist_ok=False):
    """
    Generate images from a metadata file for reproducibility purposes.

    Args:
        metadata_filename: JSON metadata produced by a previous generation run
            (simulator parameters plus the list of view positions/orientations).
        output_dir: directory where images, depth maps and metadata are written.
        overload_params: metadata entries to override before generation.
            NOTE(review): mutable default argument — harmless here because it
            is never mutated, but a ``None`` default would be safer.
        scene_datasets_paths: optional {dataset_label: local_path} mapping used
            to relocate scene/navmesh paths recorded in the metadata.
        exist_ok: forwarded to os.makedirs for the output directory.
    """
    # Reorder labels by decreasing length so that the longest matching dataset
    # label wins when one label is a prefix of another.
    if scene_datasets_paths is not None:
        scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True))

    with open(metadata_filename, 'r') as f:
        input_metadata = json.load(f)
    metadata = dict()
    for key, value in input_metadata.items():
        # Optionally replace some paths
        if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
            if scene_datasets_paths is not None:
                for dataset_label, dataset_path in scene_datasets_paths.items():
                    if value.startswith(dataset_label):
                        value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label)))
                        break
        metadata[key] = value

    # Overload some parameters
    for key, value in overload_params.items():
        metadata[key] = value

    # Everything except the view list and output options parameterizes the simulator.
    generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))])
    generate_depth = metadata["generate_depth"]

    os.makedirs(output_dir, exist_ok=exist_ok)

    generator = MultiviewHabitatSimGenerator(**generation_entries)

    # Generate views
    for idx_label, data in tqdm(metadata['multiviews'].items()):
        positions = data["positions"]
        orientations = data["orientations"]
        n = len(positions)
        for oidx in range(n):
            observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx]))
            observation_label = f"{oidx + 1}"  # observation indices start at 1
            # Color image saved using PIL
            img = PIL.Image.fromarray(observation['color'][:,:,:3])
            filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
            img.save(filename)
            if generate_depth:
                # Depth image as EXR file (half-precision)
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                # Camera parameters
                camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                with open(filename, "w") as f:
                    json.dump(camera_params, f)
    # Save metadata
    with open(os.path.join(output_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f)

    generator.close()
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--metadata_filename", required=True)
+ parser.add_argument("--output_dir", required=True)
+ args = parser.parse_args()
+
+ generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename,
+ output_dir=args.output_dir,
+ scene_datasets_paths=SCENES_DATASET,
+ overload_params=dict(),
+ exist_ok=True)
+
+
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/generate_from_metadata_files.py b/croco/datasets/habitat_sim/generate_from_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..962ef849d8c31397b8622df4f2d9140175d78873
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_from_metadata_files.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script generating commandlines to generate image pairs from metadata files.
+"""
+import os
+import glob
+from tqdm import tqdm
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_dir", required=True)
+ parser.add_argument("--output_dir", required=True)
+ parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. to setup environment.")
+ args = parser.parse_args()
+
+ input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True)
+
+ for metadata_filename in tqdm(input_metadata_filenames):
+ output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir))
+ # Do not process the scene if the metadata file already exists
+ if os.path.exists(os.path.join(output_dir, "metadata.json")):
+ continue
+ commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}"
+ print(commandline)
diff --git a/croco/datasets/habitat_sim/generate_multiview_images.py b/croco/datasets/habitat_sim/generate_multiview_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..421d49a1696474415940493296b3f2d982398850
--- /dev/null
+++ b/croco/datasets/habitat_sim/generate_multiview_images.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from tqdm import tqdm
+import argparse
+import PIL.Image
+import numpy as np
+import json
+from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError
+from datasets.habitat_sim.paths import list_scenes_available
+import cv2
+import quaternion
+import shutil
+
def generate_multiview_images_for_scene(scene_dataset_config_file,
                                        scene,
                                        navmesh,
                                        output_dir,
                                        views_count,
                                        size,
                                        exist_ok=False,
                                        generate_depth=False,
                                        **kwargs):
    """
    Generate tuples of overlapping views for a given scene.
    generate_depth: generate depth images and camera parameters.

    The run is resumable: metadata.json is saved every 10 acquisitions and, on
    restart, reloaded to skip the already-generated indices (provided the
    saved parameters match the current ones). Scenes with no navigable space
    are silently skipped.
    """
    if os.path.exists(output_dir) and not exist_ok:
        print(f"Scene {scene}: data already generated. Ignoring generation.")
        return
    try:
        print(f"Scene {scene}: {size} multiview acquisitions to generate...")
        os.makedirs(output_dir, exist_ok=exist_ok)

        metadata_filename = os.path.join(output_dir, "metadata.json")

        # Template describing this generation run (without the views themselves).
        metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file,
                                 scene=scene,
                                 navmesh=navmesh,
                                 views_count=views_count,
                                 size=size,
                                 generate_depth=generate_depth,
                                 **kwargs)
        metadata_template["multiviews"] = dict()

        if os.path.exists(metadata_filename):
            print("Metadata file already exists:", metadata_filename)
            print("Loading already generated metadata file...")
            with open(metadata_filename, "r") as f:
                metadata = json.load(f)

            # Resuming is only valid if the saved run used identical parameters.
            for key in metadata_template.keys():
                if key != "multiviews":
                    assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}."
        else:
            print("No temporary file found. Starting generation from scratch...")
            metadata = metadata_template

        starting_id = len(metadata["multiviews"])
        print(f"Starting generation from index {starting_id}/{size}...")
        if starting_id >= size:
            print("Generation already done.")
            return

        generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file,
                                                 scene=scene,
                                                 navmesh=navmesh,
                                                 views_count = views_count,
                                                 size = size,
                                                 **kwargs)

        for idx in tqdm(range(starting_id, size)):
            # Generate / re-generate the observations
            try:
                data = generator[idx]
                observations = data["observations"]
                positions = data["positions"]
                orientations = data["orientations"]

                idx_label = f"{idx:08}"
                for oidx, observation in enumerate(observations):
                    observation_label = f"{oidx + 1}"  # observation indices start at 1
                    # Color image saved using PIL
                    img = PIL.Image.fromarray(observation['color'][:,:,:3])
                    filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
                    img.save(filename)
                    if generate_depth:
                        # Depth image as EXR file (half-precision)
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                        cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                        # Camera parameters
                        camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                        with open(filename, "w") as f:
                            json.dump(camera_params, f)
                metadata["multiviews"][idx_label] = {"positions": positions.tolist(),
                                                     "orientations": orientations.tolist(),
                                                     "covisibility_ratios": data["covisibility_ratios"].tolist(),
                                                     "valid_fractions": data["valid_fractions"].tolist(),
                                                     "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()}
            except RecursionError:
                print("Recursion error: unable to sample observations for this scene. We will stop there.")
                break

            # Regularly save a temporary metadata file, in case we need to restart the generation
            if idx % 10 == 0:
                with open(metadata_filename, "w") as f:
                    json.dump(metadata, f)

        # Save metadata
        with open(metadata_filename, "w") as f:
            json.dump(metadata, f)

        generator.close()
    except NoNaviguableSpaceError:
        # The scene has no navigable surface: nothing can be generated.
        pass
+
def create_commandline(scene_data, generate_depth, exist_ok=False):
    """Build the single-line shell command that generates one scene."""
    def _quote_empty(value):
        # Empty/None values are rendered as "" so the CLI still receives them.
        return '""' if value is None or value == "" else value

    parts = [
        f"python {__file__}",
        f"--scene {_quote_empty(scene_data.scene)}",
        f"--scene_dataset_config_file {_quote_empty(scene_data.scene_dataset_config_file)}",
        f"--navmesh {_quote_empty(scene_data.navmesh)}",
        f"--output_dir {_quote_empty(scene_data.output_dir)}",
        f"--generate_depth {int(generate_depth)}",
        f"--exist_ok {int(exist_ok)}",
    ]
    # Collapse all whitespace runs to single spaces, as the original did.
    return " ".join(" ".join(parts).split())
+
+if __name__ == "__main__":
+ os.umask(2)
+
+ parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available:
+ > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands
+ """)
+
+ parser.add_argument("--output_dir", type=str, required=True)
+ parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true")
+ parser.add_argument("--scene", type=str, default="")
+ parser.add_argument("--scene_dataset_config_file", type=str, default="")
+ parser.add_argument("--navmesh", type=str, default="")
+
+ parser.add_argument("--generate_depth", type=int, default=1)
+ parser.add_argument("--exist_ok", type=int, default=0)
+
+ kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000)
+
+ args = parser.parse_args()
+ generate_depth=bool(args.generate_depth)
+ exist_ok = bool(args.exist_ok)
+
+ if args.list_commands:
+ # Listing scenes available...
+ scenes_data = list_scenes_available(base_output_dir=args.output_dir)
+
+ for scene_data in scenes_data:
+ print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok))
+ else:
+ if args.scene == "" or args.output_dir == "":
+ print("Missing scene or output dir argument!")
+ print(parser.format_help())
+ else:
+ generate_multiview_images_for_scene(scene=args.scene,
+ scene_dataset_config_file = args.scene_dataset_config_file,
+ navmesh = args.navmesh,
+ output_dir = args.output_dir,
+ exist_ok=exist_ok,
+ generate_depth=generate_depth,
+ **kwargs)
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..91e5f923b836a645caf5d8e4aacc425047e3c144
--- /dev/null
+++ b/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
@@ -0,0 +1,390 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+import numpy as np
+import quaternion
+import habitat_sim
+import json
+from sklearn.neighbors import NearestNeighbors
+import cv2
+
# OpenCV to habitat camera convention transformation.
# Rows are the Habitat-frame directions of the OpenCV axes: RIGHT, -UP, FRONT.
R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0)
# Inverse mapping (for rotation matrices, the inverse is the transpose).
R_HABITAT2OPENCV = R_OPENCV2HABITAT.T
# Degrees-to-radians conversion factor.
DEG2RAD = np.pi / 180
+
def compute_camera_intrinsics(height, width, hfov):
    """Return pinhole intrinsics (focal, cu, cv) for an image of the given
    size and horizontal field of view (degrees); principal point at center."""
    focal = width/2 / np.tan(hfov/2 * np.pi/180)
    principal_u = width / 2
    principal_v = height / 2
    return focal, principal_u, principal_v
+
def compute_camera_pose_opencv_convention(camera_position, camera_orientation):
    """Convert a Habitat pose (position + quaternion orientation) into an
    OpenCV-convention cam2world rotation matrix and translation vector."""
    rotation_habitat = quaternion.as_rotation_matrix(camera_orientation)
    R_cam2world = rotation_habitat @ R_OPENCV2HABITAT
    return R_cam2world, np.asarray(camera_position)
+
def compute_pointmap(depthmap, hfov):
    """Back-project a HxW depth map into a HxWx3 point map in the camera frame."""
    height, width = depthmap.shape
    focal, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # Pixel-coordinate grids: u runs along columns, v along rows.
    u, v = np.meshgrid(range(width), range(height))
    # Pinhole back-projection: x = (u - cu) * z / f, y = (v - cv) * z / f.
    z = depthmap
    x = (u - cu) / focal * z
    y = (v - cv) / focal * z
    return np.stack((x, y, z), axis=-1)
+
def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
    """Return the Nx3 world-frame point cloud of all valid (non-zero depth) pixels."""
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation)

    points_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
    # A depth of exactly 0 marks an invalid pixel.
    valid = points_cam[:, :, 2] != 0.0
    points_cam = points_cam.reshape(-1, 3)[valid.flatten()]
    # Rigid transform into the world frame.
    return points_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)
+
def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False):
    """
    Compute 'overlapping' metrics based on a distance threshold between two point clouds.
    """
    def _count_close(source, target):
        # Number of `source` points whose nearest neighbor in `target` lies
        # within `distance_threshold`.
        knn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(target)
        dists, _ = knn.kneighbors(source)
        return np.count_nonzero(dists.flatten() < distance_threshold)

    data = {"intersection1": _count_close(pointcloud1, pointcloud2),
            "size1": len(pointcloud1)}
    if compute_symmetric:
        data["intersection2"] = _count_close(pointcloud2, pointcloud1)
        data["size2"] = len(pointcloud2)

    return data
+
def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
    """
    Store intrinsics and cam2world pose in the observation dictionary produced
    by Habitat-Sim. In-place modification.
    """
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation)
    height, width = observation['depth'].shape
    focal, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # 3x3 pinhole intrinsics matrix.
    observation["camera_intrinsics"] = np.asarray([[focal, 0, cu],
                                                   [0, focal, cv],
                                                   [0, 0, 1.0]])
    observation["t_cam2world"] = t_cam2world
    observation["R_cam2world"] = R_cam2world
+
def look_at(eye, center, up, return_cam2world=True):
    """
    Return camera pose looking at a given center point.
    Analogous of gluLookAt function, using OpenCV camera convention.
    """
    # Forward axis (OpenCV z) points from the eye towards the target.
    forward = center - eye
    forward /= np.linalg.norm(forward, axis=-1, keepdims=True)
    # Down axis (OpenCV y): start from -up, remove its forward component,
    # then renormalize (Gram-Schmidt step).
    down = -up
    down = down - np.sum(down * forward, axis=-1, keepdims=True) * forward
    down /= np.linalg.norm(down, axis=-1, keepdims=True)
    # Right axis (OpenCV x) completes the right-handed basis.
    right = np.cross(down, forward, axis=-1)

    if return_cam2world:
        rotation = np.stack((right, down, forward), axis=-1)
        translation = eye
    else:
        # World-to-camera: transposed rotation and t = -R @ eye.
        rotation = np.stack((right, down, forward), axis=-2)
        translation = - np.einsum('...ij, ...j', rotation, eye)
    return rotation, translation
+
def look_at_for_habitat(eye, center, up, return_cam2world=True):
    # NOTE(review): `return_cam2world` is accepted but never forwarded —
    # `look_at` is always called with its default (cam2world=True); confirm
    # whether the parameter should be honored or removed.
    R, t = look_at(eye, center, up)
    # Convert the OpenCV-convention rotation into a Habitat orientation quaternion.
    orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)
    return orientation, t
+
def generate_orientation_noise(pan_range, tilt_range, roll_range):
    """Random small pan/tilt/roll perturbation composed into one quaternion."""
    def _random_rotation(angle_range, axis):
        # Uniform random angle (degrees) about the given Habitat axis.
        return quaternion.from_rotation_vector(np.random.uniform(*angle_range) * DEG2RAD * axis)

    return (_random_rotation(pan_range, habitat_sim.geo.UP)
            * _random_rotation(tilt_range, habitat_sim.geo.RIGHT)
            * _random_rotation(roll_range, habitat_sim.geo.FRONT))
+
+
class NoNaviguableSpaceError(RuntimeError):
    """Raised when a scene provides no navigable space to sample viewpoints from."""
    pass
+
+class MultiviewHabitatSimGenerator:
+ def __init__(self,
+ scene,
+ navmesh,
+ scene_dataset_config_file,
+ resolution = (240, 320),
+ views_count=2,
+ hfov = 60,
+ gpu_id = 0,
+ size = 10000,
+ minimum_covisibility = 0.5,
+ transform = None):
+ self.scene = scene
+ self.navmesh = navmesh
+ self.scene_dataset_config_file = scene_dataset_config_file
+ self.resolution = resolution
+ self.views_count = views_count
+ assert(self.views_count >= 1)
+ self.hfov = hfov
+ self.gpu_id = gpu_id
+ self.size = size
+ self.transform = transform
+
+ # Noise added to camera orientation
+ self.pan_range = (-3, 3)
+ self.tilt_range = (-10, 10)
+ self.roll_range = (-5, 5)
+
+ # Height range to sample cameras
+ self.height_range = (1.2, 1.8)
+
+ # Random steps between the camera views
+ self.random_steps_count = 5
+ self.random_step_variance = 2.0
+
+ # Minimum fraction of the scene which should be valid (well defined depth)
+ self.minimum_valid_fraction = 0.7
+
+ # Distance threshold to see to select pairs
+ self.distance_threshold = 0.05
+ # Minimum IoU of a view point cloud with respect to the reference view to be kept.
+ self.minimum_covisibility = minimum_covisibility
+
+ # Maximum number of retries.
+ self.max_attempts_count = 100
+
+ self.seed = None
+ self._lazy_initialization()
+
+ def _lazy_initialization(self):
+ # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly
+ if self.seed == None:
+ # Re-seed numpy generator
+ np.random.seed()
+ self.seed = np.random.randint(2**32-1)
+ sim_cfg = habitat_sim.SimulatorConfiguration()
+ sim_cfg.scene_id = self.scene
+ if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "":
+ sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
+ sim_cfg.random_seed = self.seed
+ sim_cfg.load_semantic_mesh = False
+ sim_cfg.gpu_device_id = self.gpu_id
+
+ depth_sensor_spec = habitat_sim.CameraSensorSpec()
+ depth_sensor_spec.uuid = "depth"
+ depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
+ depth_sensor_spec.resolution = self.resolution
+ depth_sensor_spec.hfov = self.hfov
+ depth_sensor_spec.position = [0.0, 0.0, 0]
+ depth_sensor_spec.orientation
+
+ rgb_sensor_spec = habitat_sim.CameraSensorSpec()
+ rgb_sensor_spec.uuid = "color"
+ rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
+ rgb_sensor_spec.resolution = self.resolution
+ rgb_sensor_spec.hfov = self.hfov
+ rgb_sensor_spec.position = [0.0, 0.0, 0]
+ agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec])
+
+ cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
+ self.sim = habitat_sim.Simulator(cfg)
+ if self.navmesh is not None and self.navmesh != "":
+ # Use pre-computed navmesh when available (usually better than those generated automatically)
+ self.sim.pathfinder.load_nav_mesh(self.navmesh)
+
+ if not self.sim.pathfinder.is_loaded:
+ # Try to compute a navmesh
+ navmesh_settings = habitat_sim.NavMeshSettings()
+ navmesh_settings.set_defaults()
+ self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)
+
+ # Ensure that the navmesh is not empty
+ if not self.sim.pathfinder.is_loaded:
+ raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})")
+
+ self.agent = self.sim.initialize_agent(agent_id=0)
+
+ def close(self):
+ self.sim.close()
+
+ def __del__(self):
+ self.sim.close()
+
+ def __len__(self):
+ return self.size
+
+ def sample_random_viewpoint(self):
+ """ Sample a random viewpoint using the navmesh """
+ nav_point = self.sim.pathfinder.get_random_navigable_point()
+
+ # Sample a random viewpoint height
+ viewpoint_height = np.random.uniform(*self.height_range)
+ viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
+ viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return viewpoint_position, viewpoint_orientation, nav_point
+
+ def sample_other_random_viewpoint(self, observed_point, nav_point):
+ """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point."""
+ other_nav_point = nav_point
+
+ walk_directions = self.random_step_variance * np.asarray([1,0,1])
+ for i in range(self.random_steps_count):
+ temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3))
+ # Snapping may return nan when it fails
+ if not np.isnan(temp[0]):
+ other_nav_point = temp
+
+ other_viewpoint_height = np.random.uniform(*self.height_range)
+ other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP
+
+ # Set viewing direction towards the central point
+ rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True)
+ rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
+ return position, rotation, other_nav_point
+
+ def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ pixels_count = self.resolution[0] * self.resolution[1]
+ valid_fraction = len(other_pointcloud) / pixels_count
+ assert valid_fraction <= 1.0 and valid_fraction >= 0.0
+ overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True)
+ covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count)
+ is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility)
+ return is_valid, valid_fraction, covisibility
+
+ def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation):
+ """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
+ # Observation
+ other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation)
+ return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
+
+ def render_viewpoint(self, viewpoint_position, viewpoint_orientation):
+ agent_state = habitat_sim.AgentState()
+ agent_state.position = viewpoint_position
+ agent_state.rotation = viewpoint_orientation
+ self.agent.set_state(agent_state)
+ viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
+ _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation)
+ return viewpoint_observations
+
    def __getitem__(self, useless_idx):
        """Sample one multi-view item: a reference viewpoint plus
        (views_count - 1) nearby, sufficiently covisible viewpoints.

        The index is ignored: sampling is fully random and the dataset length
        only controls epoch size.
        NOTE(review): failure cases recurse via `return self[0]`, which can
        raise RecursionError on degenerate scenes — confirm this is acceptable.
        """
        ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        # Extract point cloud
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)

        pixels_count = self.resolution[0] * self.resolution[1]
        # Fraction of pixels whose depth lifted to a valid 3D point.
        ref_valid_fraction = len(ref_pointcloud) / pixels_count
        assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0
        if ref_valid_fraction < self.minimum_valid_fraction:
            # This should produce a recursion error at some point when something is very wrong.
            return self[0]
        # Pick an reference observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)

        # Add the first image as reference
        viewpoints_observations = [ref_observations]
        viewpoints_covisibility = [ref_valid_fraction]
        viewpoints_positions = [ref_position]
        viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
        viewpoints_clouds = [ref_pointcloud]
        viewpoints_valid_fractions = [ref_valid_fraction]

        for _ in range(self.views_count - 1):
            # Generate an other viewpoint using some dummy random walk
            successful_sampling = False
            for sampling_attempt in range(self.max_attempts_count):
                position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point)
                # Observation
                other_viewpoint_observations = self.render_viewpoint(position, rotation)
                other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation)

                is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
                if is_valid:
                    successful_sampling = True
                    break
            if not successful_sampling:
                print("WARNING: Maximum number of attempts reached.")
                # Dirty hack, try using a novel original viewpoint
                return self[0]
            # The loop above guarantees these names are bound when we get here.
            viewpoints_observations.append(other_viewpoint_observations)
            viewpoints_covisibility.append(covisibility)
            viewpoints_positions.append(position)
            viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding.
            viewpoints_clouds.append(other_pointcloud)
            viewpoints_valid_fractions.append(valid_fraction)

        # Estimate relations between all pairs of images
        pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations)))
        for i in range(len(viewpoints_observations)):
            # Diagonal: a view's overlap with itself is its valid fraction.
            pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i]
            for j in range(i+1, len(viewpoints_observations)):
                overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True)
                # Directed overlaps: matrix is not symmetric in general.
                pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count
                pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count

        # IoU is relative to the image 0
        data = {"observations": viewpoints_observations,
                "positions": np.asarray(viewpoints_positions),
                "orientations": np.asarray(viewpoints_orientations),
                "covisibility_ratios": np.asarray(viewpoints_covisibility),
                "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
                "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float),
                }

        if self.transform is not None:
            data = self.transform(data)
        return data
+
    def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False):
        """
        Return a list of images corresponding to a spiral trajectory from a random starting point.
        Useful to generate nice visualisations.
        Use an even number of half turns to get a nice "C1-continuous" loop effect

        Returns a tuple (images, all_valid) where `all_valid` is True when
        every rendered frame overlapped sufficiently with the reference view.
        """
        ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)
        pixels_count = self.resolution[0] * self.resolution[1]
        if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
            # Dirty hack: ensure that the valid part of the image is significant
            # NOTE(review): unbounded recursion if the scene has no valid viewpoint.
            return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation)

        # Pick an observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)
        ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation)

        images = []
        is_valid = []
        # Spiral trajectory, use_constant orientation
        for i, alpha in enumerate(np.linspace(0, 1, images_count)):
            r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius
            theta = alpha * half_turns * np.pi
            x = r * np.cos(theta)
            y = r * np.sin(theta)
            z = 0.0
            # Offset expressed in the reference camera frame, mapped to world.
            position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten()
            if use_constant_orientation:
                orientation = ref_orientation
            else:
                # trajectory looking at a mean point in front of the ref observation
                orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP)
            observations = self.render_viewpoint(position, orientation)
            # Drop the alpha channel, keep RGB only.
            images.append(observations['color'][...,:3])
            _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation)
            is_valid.append(_is_valid)
        return images, np.all(is_valid)
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/pack_metadata_files.py b/croco/datasets/habitat_sim/pack_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..10672a01f7dd615d3b4df37781f7f6f97e753ba6
--- /dev/null
+++ b/croco/datasets/habitat_sim/pack_metadata_files.py
@@ -0,0 +1,69 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+"""
+Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.
+"""
+import os
+import glob
+from tqdm import tqdm
+import shutil
+import json
+from datasets.habitat_sim.paths import *
+import argparse
+import collections
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    args = parser.parse_args()

    input_dirname = args.input_dir
    output_dirname = args.output_dir

    # Lazily iterate all metadata.json files under the input tree.
    input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True)

    # Per-dataset image counters; defaultdict(int) is the idiomatic
    # zero-default counter (replaces defaultdict(lambda: 0)).
    images_count = collections.defaultdict(int)

    os.makedirs(output_dirname)
    for input_filename in tqdm(input_metadata_filenames):
        # Ignore empty files
        with open(input_filename, "r") as f:
            original_metadata = json.load(f)
        if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0:
            print("No views in", input_filename)
            continue

        relpath = os.path.relpath(input_filename, input_dirname)
        print(relpath)

        # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
        # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern.
        scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True))
        metadata = dict()
        for key, value in original_metadata.items():
            if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
                known_path = False
                for dataset, dataset_path in scenes_dataset_paths.items():
                    if value.startswith(dataset_path):
                        value = os.path.join(dataset, os.path.relpath(value, dataset_path))
                        known_path = True
                        break
                if not known_path:
                    raise KeyError("Unknown path:" + value)
            metadata[key] = value

        # Compile some general statistics while packing data
        scene_split = metadata["scene"].split("/")
        # HM3D scenes are grouped by split (hm3d/<split>); others by dataset.
        upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
        images_count[upper_level] += len(metadata["multiviews"])

        output_filename = os.path.join(output_dirname, relpath)
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, "w") as f:
            json.dump(metadata, f)

    # Print statistics
    print("Images count:")
    for upper_level, count in images_count.items():
        print(f"- {upper_level}: {count}")
\ No newline at end of file
diff --git a/croco/datasets/habitat_sim/paths.py b/croco/datasets/habitat_sim/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d63b5fa29c274ddfeae084734a35ba66d7edee8
--- /dev/null
+++ b/croco/datasets/habitat_sim/paths.py
@@ -0,0 +1,129 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Paths to Habitat-Sim scenes
+"""
+
+import os
+import json
+import collections
+from tqdm import tqdm
+
+
# Hardcoded path to the different scene datasets
SCENES_DATASET = {
    "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/",
    "gibson": "./data/habitat-sim-data/scene_datasets/gibson/",
    "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/",
    "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/",
    "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/",
    "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/",
    "scannet": "./data/habitat-sim/scene_datasets/scannet/"
}

# Lightweight record describing one scene to render:
# - scene_dataset_config_file: habitat-sim dataset config ("" when unused)
# - scene: path or id of the scene mesh / scene instance
# - navmesh: path to a precomputed navmesh ("" to recompute on load)
# - output_dir: where generated data for this scene is written
SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"])
+
def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]):
    """Enumerate the ReplicaCAD scenes (apt_0..apt_5 plus the empty stage)."""
    config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json")
    scene_names = [f"apt_{i}" for i in range(6)] + ["empty_stage"]
    navmesh_files = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"]
    return [
        SceneData(
            scene_dataset_config_file=config_file,
            scene=name + ".scene_instance.json",
            navmesh=os.path.join(base_path, navmesh),
            output_dir=os.path.join(base_output_dir, "ReplicaCAD", name),
        )
        for name, navmesh in zip(scene_names, navmesh_files)
    ]
+
def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]):
    """Enumerate the ReplicaCAD "baked lighting" scenes.

    These scenes ship without navmesh files, so ``navmesh`` is left empty and
    the generator falls back to recomputing one at load time.
    (Removed a dead ``navmeshes = ""`` assignment with commented-out code.)
    """
    scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json")
    # 5 apartments x 21 staging variants, enumerated staging-major like the
    # original nested-list flattening.
    scenes = [f"Baked_sc{i}_staging_{j:02}" for j in range(21) for i in range(5)]
    scenes_data = []
    for scene in scenes:
        output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scene)
        scenes_data.append(SceneData(scene_dataset_config_file=scene_dataset_config_file,
                                     scene=scene,
                                     navmesh="",
                                     output_dir=output_dir))
    return scenes_data
+
def list_replica_scenes(base_output_dir, base_path):
    """Enumerate Replica scenes found as immediate subfolders of *base_path*."""
    scenes_data = []
    for scene_id in os.listdir(base_path):
        scenes_data.append(SceneData(
            scene_dataset_config_file="",
            scene=os.path.join(base_path, scene_id, "mesh.ply"),
            # Not sure if I should use it
            navmesh=os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh"),
            output_dir=os.path.join(base_output_dir, scene_id),
        ))
    return scenes_data
+
+
def list_scenes(base_output_dir, base_path):
    """
    Generic method iterating through a base_path folder to find scenes.
    """
    scenes_data = []
    for root, dirs, files in os.walk(base_path, followlinks=True):
        folder_data = []
        for filename in files:
            stem, extension = os.path.splitext(filename)
            if extension != ".glb":
                continue
            navmesh_path = os.path.join(root, stem + ".navmesh")
            if not os.path.exists(navmesh_path):
                navmesh_path = ""
            relpath = os.path.relpath(root, base_path)
            folder_data.append(SceneData(
                scene_dataset_config_file="",
                scene=os.path.join(root, stem + ".glb"),
                navmesh=navmesh_path,
                output_dir=os.path.abspath(os.path.join(base_output_dir, relpath, stem)),
            ))

        # Specific check for HM3D: when both xxxx.basis.glb and xxxx.glb are
        # present, keep the 'basis' version and drop the plain duplicate.
        basis_prefixes = {d.scene[:-len(".basis.glb")] for d in folder_data if d.scene.endswith(".basis.glb")}
        if basis_prefixes:
            folder_data = [d for d in folder_data if d.scene[:-len(".glb")] not in basis_prefixes]

        scenes_data.extend(folder_data)
    return scenes_data
+
def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
    """Collect SceneData entries for every supported scene dataset.

    Bug fix: the Replica entries were previously computed but discarded (the
    return value of list_replica_scenes was never appended); they are now
    included in the result.
    """
    scenes_data = []

    # HM3D
    for split in ("minival", "train", "val", "examples"):
        scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
                                   base_path=f"{scenes_dataset_paths['hm3d']}/{split}")

    # Gibson
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"),
                               base_path=scenes_dataset_paths["gibson"])

    # Habitat test scenes (just a few)
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
                               base_path=scenes_dataset_paths["habitat-test-scenes"])

    # ReplicaCAD (baked lighting)
    scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir)

    # ScanNet
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"),
                               base_path=scenes_dataset_paths["scannet"])

    # Replica
    scenes_data += list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"),
                                       base_path=scenes_dataset_paths["replica"])
    return scenes_data
diff --git a/croco/datasets/pairs_dataset.py b/croco/datasets/pairs_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f107526b34e154d9013a9a7a0bde3d5ff6f581c
--- /dev/null
+++ b/croco/datasets/pairs_dataset.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from torch.utils.data import Dataset
+from PIL import Image
+
+from datasets.transforms import get_pair_transforms
+
def load_image(impath):
    """Open the image at *impath* (PIL defers pixel decoding until use)."""
    return Image.open(impath)
+
def load_pairs_from_cache_file(fname, root=''):
    """Read a cache file of whitespace-separated image pairs, one pair per line."""
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = []
    for line in lines:
        tokens = line.split()
        pairs.append((os.path.join(root, tokens[0]), os.path.join(root, tokens[1])))
    return pairs
+
def load_pairs_from_list_file(fname, root=''):
    """Read a listing file of pair basenames; '#'-prefixed lines are comments.

    Each basename `x` expands to the pair (`x_1.jpg`, `x_2.jpg`) under *root*.
    """
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = []
    for line in lines:
        if line.startswith('#'):
            continue
        pairs.append((os.path.join(root, line + '_1.jpg'), os.path.join(root, line + '_2.jpg')))
    return pairs
+
+
def write_cache_file(fname, pairs, root=''):
    """Write image pairs to a cache file, stripping the *root* prefix from each path."""
    if len(root) > 0:
        if not root.endswith('/'):
            root += '/'
        assert os.path.isdir(root)
    lines = []
    for im1, im2 in pairs:
        if len(root) > 0:
            # Every stored path must live under root so the prefix strip is valid.
            assert im1.startswith(root), im1
            assert im2.startswith(root), im2
        lines.append('{:s} {:s}'.format(im1[len(root):], im2[len(root):]))
    with open(fname, 'w') as fid:
        # No trailing newline, matching the original s[:-1] behavior.
        fid.write('\n'.join(lines))
+
def parse_and_cache_all_pairs(dname, data_dir='./data/'):
    """Walk a dataset folder, list its image pairs and write them to a cache file."""
    if dname != 'habitat_release':
        raise NotImplementedError('Unknown dataset: '+dname)

    dirname = os.path.join(data_dir, 'habitat_release')
    assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
    cache_file = os.path.join(dirname, 'pairs.txt')
    assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file

    print('Parsing pairs for dataset: '+dname)
    pairs = []
    for root, dirs, files in os.walk(dirname):
        # Validation folders are excluded from the cache.
        if 'val' in root:
            continue
        dirs.sort()
        for f in sorted(files):
            if f.endswith('_1.jpeg'):
                pairs.append((os.path.join(root, f), os.path.join(root, f[:-len('_1.jpeg')] + '_2.jpeg')))
    print('Found {:,} pairs'.format(len(pairs)))
    print('Writing cache to: '+cache_file)
    write_cache_file(cache_file, pairs, root=dirname)
+
def dnames_to_image_pairs(dnames, data_dir='./data/'):
    """
    dnames: list of datasets with image pairs, separated by +

    Returns the concatenated list of (image1_path, image2_path) pairs.
    Raises NotImplementedError for an unknown dataset name (previously an
    unknown name fell through, leaving `pairs` unbound — NameError — or
    silently re-appending the previous iteration's pairs).
    """
    all_pairs = []
    for dname in dnames.split('+'):
        if dname == 'habitat_release':
            dirname = os.path.join(data_dir, 'habitat_release')
            assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname
            cache_file = os.path.join(dirname, 'pairs.txt')
            assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file
            pairs = load_pairs_from_cache_file(cache_file, root=dirname)
        elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']:
            dirname = os.path.join(data_dir, dname+'_crops')
            assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
            list_file = os.path.join(dirname, 'listing.txt')
            assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. {:s}".format(dname, list_file)
            pairs = load_pairs_from_list_file(list_file, root=dirname)
        else:
            raise NotImplementedError('Unknown dataset: '+dname)
        print('  {:s}: {:,} pairs'.format(dname, len(pairs)))
        all_pairs += pairs
    if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs)))
    return all_pairs
+
+
class PairsDataset(Dataset):
    """Dataset of image pairs loaded from disk, with optional paired transforms."""

    def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'):
        super().__init__()
        self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir)
        self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize)

    def __len__(self):
        return len(self.image_pairs)

    def __getitem__(self, index):
        path1, path2 = self.image_pairs[index]
        img1, img2 = load_image(path1), load_image(path2)
        if self.transforms is None:
            return img1, img2
        return self.transforms(img1, img2)
+
+
# Script entry point: build and cache the pair listing for one dataset.
if __name__=="__main__":
    import argparse
    parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset")
    parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
    parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset")
    args = parser.parse_args()
    parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir)
diff --git a/croco/datasets/transforms.py b/croco/datasets/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..216bac61f8254fd50e7f269ee80301f250a2d11e
--- /dev/null
+++ b/croco/datasets/transforms.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+import torchvision.transforms
+import torchvision.transforms.functional as F
+
+# "Pair": apply a transform on a pair
+# "Both": apply the exact same transform to both images
+
class ComposePair(torchvision.transforms.Compose):
    """Compose whose component transforms each take and return an image pair."""
    def __call__(self, img1, img2):
        for transform in self.transforms:
            img1, img2 = transform(img1, img2)
        return img1, img2
+
class NormalizeBoth(torchvision.transforms.Normalize):
    """Apply the exact same normalization to both images of a pair."""
    def forward(self, img1, img2):
        return super().forward(img1), super().forward(img2)
+
class ToTensorBoth(torchvision.transforms.ToTensor):
    """Convert both images of a pair to tensors."""
    def __call__(self, img1, img2):
        return super().__call__(img1), super().__call__(img2)
+
class RandomCropPair(torchvision.transforms.RandomCrop):
    """Random crop applied independently to each image.

    The crop locations are intentionally different for the two images.
    """
    def forward(self, img1, img2):
        return super().forward(img1), super().forward(img2)
+
class ColorJitterPair(torchvision.transforms.ColorJitter):
    """Color jitter for image pairs.

    With probability `assymetric_prob` the second image is jittered with
    freshly drawn parameters (asymmetric); otherwise both images share the
    same jitter parameters (symmetric).
    """
    def __init__(self, assymetric_prob, **kwargs):
        super().__init__(**kwargs)
        self.assymetric_prob = assymetric_prob

    def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor):
        # Apply the selected adjustments in the randomly drawn order.
        adjustments = {
            0: (brightness_factor, F.adjust_brightness),
            1: (contrast_factor, F.adjust_contrast),
            2: (saturation_factor, F.adjust_saturation),
            3: (hue_factor, F.adjust_hue),
        }
        for fn_id in fn_idx:
            factor, adjust = adjustments[int(fn_id)]
            if factor is not None:
                img = adjust(img, factor)
        return img

    def forward(self, img1, img2):
        params = self.get_params(self.brightness, self.contrast, self.saturation, self.hue)
        img1 = self.jitter_one(img1, *params)
        if torch.rand(1) < self.assymetric_prob:  # asymmetric: re-draw for img2
            params = self.get_params(self.brightness, self.contrast, self.saturation, self.hue)
        img2 = self.jitter_one(img2, *params)
        return img1, img2
+
def get_pair_transforms(transform_str, totensor=True, normalize=True):
    """Build a pair transform from a spec string, e.g. "crop224+acolor".

    Returns None when no transform is requested, otherwise a callable taking
    (img1, img2) and returning the transformed pair.

    Bug fix: the single-transform case previously returned the bare list
    `trfs`, which is not callable — callers invoke `self.transforms(im1, im2)`
    and would crash. Any non-empty list is now wrapped in ComposePair.
    """
    trfs = []
    for s in transform_str.split('+'):
        if s.startswith('crop'):
            size = int(s[len('crop'):])
            trfs.append(RandomCropPair(size))
        elif s=='acolor':
            trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0))
        elif s=='': # if transform_str was ""
            pass
        else:
            raise NotImplementedError('Unknown augmentation: '+s)

    if totensor:
        trfs.append( ToTensorBoth() )
    if normalize:
        trfs.append( NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) )

    if len(trfs)==0:
        return None
    return ComposePair(trfs)
+
+
+
+
+
diff --git a/croco/demo.py b/croco/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b80ccc5c98c18e20d1ce782511aa824ef28f77
--- /dev/null
+++ b/croco/demo.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+from models.croco import CroCoNet
+from PIL import Image
+import torchvision.transforms
+from torchvision.transforms import ToTensor, Normalize, Compose
+
def main():
    """Run cross-view completion on the two demo images and save a visualization.

    Loads the pretrained CroCo checkpoint, masks the first view, reconstructs
    it conditioned on the second view, and writes a side-by-side PNG.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count()>0 else 'cpu')

    # load 224x224 images and transform them to tensor
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1,3,1,1).to(device, non_blocking=True)
    imagenet_std = [0.229, 0.224, 0.225]
    imagenet_std_tensor = torch.tensor(imagenet_std).view(1,3,1,1).to(device, non_blocking=True)
    trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)])
    image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
    image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)

    # load model
    ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
    model = CroCoNet( **ckpt.get('croco_kwargs',{})).to(device)
    model.eval()
    msg = model.load_state_dict(ckpt['model'], strict=True)

    # forward
    with torch.inference_mode():
        out, mask, target = model(image1, image2)

    # the output is normalized, thus use the mean/std of the actual image to go back to RGB space
    patchified = model.patchify(image1)
    mean = patchified.mean(dim=-1, keepdim=True)
    var = patchified.var(dim=-1, keepdim=True)
    decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean)
    # undo imagenet normalization, prepare masked image
    decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor
    input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor
    ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor
    # NOTE(review): assumes mask==1 marks masked patches — confirm against CroCoNet
    image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])
    masked_input_image = ((1 - image_masks) * input_image)

    # make visualization
    visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3) # 4*(B, 3, H, W) -> B, 3, H, W*4
    B, C, H, W = visualization.shape
    # Stack batch items vertically: (B, C, H, W*4) -> (C, B*H, W*4)
    visualization = visualization.permute(1, 0, 2, 3).reshape(C, B*H, W)
    visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1))
    fname = "demo_output.png"
    visualization.save(fname)
    print('Visualization save in '+fname)
+
+
# Script entry point.
if __name__=="__main__":
    main()
diff --git a/croco/interactive_demo.ipynb b/croco/interactive_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6cfc960af5baac9a69029c29a16eea4e24123a71
--- /dev/null
+++ b/croco/interactive_demo.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactive demo of Cross-view Completion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+ "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import numpy as np\n",
+ "from models.croco import CroCoNet\n",
+ "from ipywidgets import interact, interactive, fixed, interact_manual\n",
+ "import ipywidgets as widgets\n",
+ "import matplotlib.pyplot as plt\n",
+ "import quaternion\n",
+ "import models.masking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load CroCo model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n",
+ "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n",
+ "msg = model.load_state_dict(ckpt['model'], strict=True)\n",
+ "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+ "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+ "model = model.eval()\n",
+ "model = model.to(device=device)\n",
+ "print(msg)\n",
+ "\n",
+ "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n",
+ " \"\"\"\n",
+ " Perform Cross-View completion using two input images, specified using Numpy arrays.\n",
+ " \"\"\"\n",
+ " # Replace the mask generator\n",
+ " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n",
+ "\n",
+ " # ImageNet-1k color normalization\n",
+ " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n",
+ " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n",
+ "\n",
+ " normalize_input_colors = True\n",
+ " is_output_normalized = True\n",
+ " with torch.no_grad():\n",
+ " # Cast data to torch\n",
+ " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+ "\n",
+ " if normalize_input_colors:\n",
+ " ref_image = (ref_image - imagenet_mean) / imagenet_std\n",
+ " target_image = (target_image - imagenet_mean) / imagenet_std\n",
+ "\n",
+ " out, mask, _ = model(target_image, ref_image)\n",
+ " # # get target\n",
+ " if not is_output_normalized:\n",
+ " predicted_image = model.unpatchify(out)\n",
+ " else:\n",
+ " # The output only contains higher order information,\n",
+ " # we retrieve mean and standard deviation from the actual target image\n",
+ " patchified = model.patchify(target_image)\n",
+ " mean = patchified.mean(dim=-1, keepdim=True)\n",
+ " var = patchified.var(dim=-1, keepdim=True)\n",
+ " pred_renorm = out * (var + 1.e-6)**.5 + mean\n",
+ " predicted_image = model.unpatchify(pred_renorm)\n",
+ "\n",
+ " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n",
+ " masked_target_image = (1 - image_masks) * target_image\n",
+ " \n",
+ " if not reconstruct_unmasked_patches:\n",
+ " # Replace unmasked patches by their actual values\n",
+ " predicted_image = predicted_image * image_masks + masked_target_image\n",
+ "\n",
+ " # Unapply color normalization\n",
+ " if normalize_input_colors:\n",
+ " predicted_image = predicted_image * imagenet_std + imagenet_mean\n",
+ " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n",
+ " \n",
+ " # Cast to Numpy\n",
+ " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+ " return masked_target_image, predicted_image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n",
+ "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n",
+ "import habitat_sim\n",
+ "\n",
+ "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n",
+ "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n",
+ "\n",
+ "sim_cfg = habitat_sim.SimulatorConfiguration()\n",
+ "if use_gpu: sim_cfg.gpu_device_id = 0\n",
+ "sim_cfg.scene_id = scene\n",
+ "sim_cfg.load_semantic_mesh = False\n",
+ "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n",
+ "rgb_sensor_spec.uuid = \"color\"\n",
+ "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n",
+ "rgb_sensor_spec.resolution = (224,224)\n",
+ "rgb_sensor_spec.hfov = 56.56\n",
+ "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n",
+ "rgb_sensor_spec.orientation = [0, 0, 0]\n",
+ "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n",
+ "\n",
+ "\n",
+ "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n",
+ "sim = habitat_sim.Simulator(cfg)\n",
+ "if navmesh is not None:\n",
+ " sim.pathfinder.load_nav_mesh(navmesh)\n",
+ "agent = sim.initialize_agent(agent_id=0)\n",
+ "\n",
+ "def sample_random_viewpoint():\n",
+ " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n",
+ " nav_point = sim.pathfinder.get_random_navigable_point()\n",
+ " # Sample a random viewpoint height\n",
+ " viewpoint_height = np.random.uniform(1.0, 1.6)\n",
+ " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n",
+ " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n",
+ " return viewpoint_position, viewpoint_orientation\n",
+ "\n",
+ "def render_viewpoint(position, orientation):\n",
+ " agent_state = habitat_sim.AgentState()\n",
+ " agent_state.position = position\n",
+ " agent_state.rotation = orientation\n",
+ " agent.set_state(agent_state)\n",
+ " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n",
+ " image = viewpoint_observations['color'][:,:,:3]\n",
+ " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n",
+ " return image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sample a random reference view"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ref_position, ref_orientation = sample_random_viewpoint()\n",
+ "ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ "plt.clf()\n",
+ "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n",
+ "axes[0,0].imshow(ref_image)\n",
+ "for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Interactive cross-view completion using CroCo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reconstruct_unmasked_patches = False\n",
+ "\n",
+ "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n",
+ " R = quaternion.as_rotation_matrix(ref_orientation)\n",
+ " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n",
+ " target_orientation = (ref_orientation\n",
+ " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n",
+ " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n",
+ " \n",
+ " ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+ " target_image = render_viewpoint(target_position, target_orientation)\n",
+ "\n",
+ " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n",
+ "\n",
+ " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n",
+ " axes[0].imshow(ref_image)\n",
+ " axes[0].set_xlabel(\"Reference\")\n",
+ " axes[1].imshow(masked_target_image)\n",
+ " axes[1].set_xlabel(\"Masked target\")\n",
+ " axes[2].imshow(predicted_image)\n",
+ " axes[2].set_xlabel(\"Reconstruction\") \n",
+ " axes[3].imshow(target_image)\n",
+ " axes[3].set_xlabel(\"Target\")\n",
+ " for ax in axes.flatten():\n",
+ " ax.set_xticks([])\n",
+ " ax.set_yticks([])\n",
+ "\n",
+ "interact(show_demo,\n",
+ " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n",
+ " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+ " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n",
+ " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/croco/models/__pycache__/blocks.cpython-310.pyc b/croco/models/__pycache__/blocks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efa90d1c03a84c2f7a38687a5ebd669574b21742
Binary files /dev/null and b/croco/models/__pycache__/blocks.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/blocks.cpython-38.pyc b/croco/models/__pycache__/blocks.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5656e856bfb677b31397411f53e97897605bf57e
Binary files /dev/null and b/croco/models/__pycache__/blocks.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/croco.cpython-310.pyc b/croco/models/__pycache__/croco.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca15afa9df757bf3657afc2417916c50397a5a61
Binary files /dev/null and b/croco/models/__pycache__/croco.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/croco.cpython-38.pyc b/croco/models/__pycache__/croco.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cda8f1ec2e60077d9fb4a030c5fadd36db05ea42
Binary files /dev/null and b/croco/models/__pycache__/croco.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/dpt_block.cpython-310.pyc b/croco/models/__pycache__/dpt_block.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee797f8209b6197b7cefb7a0abab6b9b4114dfa3
Binary files /dev/null and b/croco/models/__pycache__/dpt_block.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/dpt_block.cpython-38.pyc b/croco/models/__pycache__/dpt_block.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ec48ecea7f4bc4a0222bd683e9758cdfcce70fa
Binary files /dev/null and b/croco/models/__pycache__/dpt_block.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/masking.cpython-310.pyc b/croco/models/__pycache__/masking.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3438d62138dac00d4d6e63691da265481006fd25
Binary files /dev/null and b/croco/models/__pycache__/masking.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/masking.cpython-38.pyc b/croco/models/__pycache__/masking.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b40acb53c29bf1bb9d4aeb6f19c6a4dee7b2bb0f
Binary files /dev/null and b/croco/models/__pycache__/masking.cpython-38.pyc differ
diff --git a/croco/models/__pycache__/pos_embed.cpython-310.pyc b/croco/models/__pycache__/pos_embed.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97e0c3ef339c700e1c7b215d709e70a2d55d078b
Binary files /dev/null and b/croco/models/__pycache__/pos_embed.cpython-310.pyc differ
diff --git a/croco/models/__pycache__/pos_embed.cpython-38.pyc b/croco/models/__pycache__/pos_embed.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45a53efdb0071da6cfa9f741cf0ab5a804ae8bb9
Binary files /dev/null and b/croco/models/__pycache__/pos_embed.cpython-38.pyc differ
diff --git a/croco/models/blocks.py b/croco/models/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..297187871314937a9a43f2901c1ffa8bb41cf762
--- /dev/null
+++ b/croco/models/blocks.py
@@ -0,0 +1,241 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Main encoder/decoder blocks
+# --------------------------------------------------------
+# References:
+# timm
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
+
+
+import torch
+import torch.nn as nn
+
+from itertools import repeat
+import collections.abc
+
+
def _ntuple(n):
    """Build a converter that expands a scalar into an n-tuple.

    Non-string iterables are assumed to already carry the right arity and are
    returned unchanged; everything else is repeated n times.
    """
    def parse(value):
        if isinstance(value, collections.abc.Iterable) and not isinstance(value, str):
            return value
        return tuple(repeat(value, n))
    return parse


# Common case: pairs such as (height, width) or (kernel_h, kernel_w).
to_2tuple = _ntuple(2)
+
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Stochastic Depth: randomly zero entire samples of a residual branch.

    A no-op at inference time or when drop_prob is 0. Otherwise each sample in
    the batch survives with probability 1 - drop_prob; surviving samples are
    optionally rescaled by 1/keep_prob so the expected magnitude is unchanged.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast across all remaining dims
    # (works for any tensor rank, not just 2D ConvNet activations).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        keep_mask.div_(keep_prob)
    return x * keep_mask
+
class DropPath(nn.Module):
    """Module wrapper around drop_path (Stochastic Depth per sample).

    Honors the module's own training flag, so it becomes the identity in eval mode.
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        # Shown in printed module summaries, e.g. DropPath(drop_prob=0.100).
        return f'drop_prob={round(self.drop_prob,3):0.3f}'
+
class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout,
    as used in Vision Transformer, MLP-Mixer and related networks."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # bias/drop may be a single value (shared by both layers) or a pair.
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
+
class Attention(nn.Module):
    """Multi-head self-attention with optional rotary position embedding (RoPE)."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(self, x, xpos):
        """Attend over x (B, N, C); xpos holds per-token positions, consumed only by RoPE."""
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # Joint q/k/v projection, then split heads: (B, num_heads, 3, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).transpose(1, 3)
        q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]

        if self.rope is not None:
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+
class Block(nn.Module):
    """Standard transformer encoder block: pre-norm self-attention and pre-norm MLP,
    each wrapped in a residual connection with optional stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # Stochastic depth on both residual branches; identity when drop_path == 0.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def forward(self, x, xpos):
        """xpos: per-token positions, forwarded to the attention layer (used by RoPE)."""
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        return x + self.drop_path(self.mlp(self.norm2(x)))
+
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries from one token set attend to keys/values
    from another, with optional RoPE applied to queries and keys."""

    def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        # Separate projections: queries and keys/values come from different sources.
        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.rope = rope

    def forward(self, query, key, value, qpos, kpos):
        """qpos/kpos: positions of the query and key tokens respectively (RoPE only)."""
        B, Nq, C = query.shape
        head_dim = C // self.num_heads

        def split_heads(tokens, proj):
            n = tokens.shape[1]
            return proj(tokens).reshape(B, n, self.num_heads, head_dim).permute(0, 2, 1, 3)

        q = split_heads(query, self.projq)
        k = split_heads(key, self.projk)
        v = split_heads(value, self.projv)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        out = (scores @ v).transpose(1, 2).reshape(B, Nq, C)
        return self.proj_drop(self.proj(out))
+
class DecoderBlock(nn.Module):
    """Transformer decoder block: self-attention on x, cross-attention from x to the
    memory tokens y, then an MLP — all pre-norm with residual connections."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.norm3 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        # Optionally normalize the memory tokens before they are cross-attended.
        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()

    def forward(self, x, y, xpos, ypos):
        # Self-attention over x's tokens.
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        # Cross-attention: x queries attend to the memory tokens y.
        y_normed = self.norm_y(y)
        x = x + self.drop_path(self.cross_attn(self.norm2(x), y_normed, y_normed, xpos, ypos))
        x = x + self.drop_path(self.mlp(self.norm3(x)))
        # y is passed through unchanged so decoder blocks can be chained uniformly.
        return x, y
+
+
+# patch embedding
class PositionGetter(object):
    """Produce the (row, col) grid coordinate of every patch of an h x w feature map.

    The coordinate grid is cached per (h, w) so it is built only once per resolution.
    """

    def __init__(self):
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        if (h, w) not in self.cache_positions:
            xs = torch.arange(w, device=device)
            ys = torch.arange(h, device=device)
            # All (y, x) pairs in row-major order; shape (h*w, 2).
            self.cache_positions[h, w] = torch.cartesian_prod(ys, xs)
        # Broadcast the cached grid to the batch: (b, h*w, 2).
        return self.cache_positions[h, w].view(1, h * w, 2).expand(b, -1, 2).clone()
+
class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution.

    Equivalent to timm's PatchEmbed, plus a _init_weights helper and a
    PositionGetter that returns the grid coordinate of every patch.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        # Non-overlapping patch projection: kernel size == stride == patch size.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

        self.position_getter = PositionGetter()

    def forward(self, x):
        """Return (tokens, positions): tokens (B, N, C) when flattened, positions (B, N, 2)."""
        B, C, H, W = x.shape
        torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
        torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
        x = self.proj(x)
        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, pos

    def _init_weights(self):
        # MAE-style init: xavier-uniform over the flattened conv weight.
        w = self.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
diff --git a/croco/models/criterion.py b/croco/models/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..11696c40865344490f23796ea45e8fbd5e654731
--- /dev/null
+++ b/croco/models/criterion.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Criterion to train CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# --------------------------------------------------------
+
+import torch
+
class MaskedMSE(torch.nn.Module):
    """Mean-squared-error criterion for CroCo pretraining.

    norm_pix_loss: normalize each target patch by its own mean and variance
        before comparing (as in MAE).
    masked: average the per-patch loss over masked patches only; otherwise
        average over every patch.
    """

    def __init__(self, norm_pix_loss=False, masked=True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss
        self.masked = masked

    def forward(self, pred, mask, target):
        """pred/target: (N, L, D) patch tensors; mask: (N, L), nonzero where masked."""
        if self.norm_pix_loss:
            mu = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mu) / (var + 1.e-6)**.5

        per_patch = ((pred - target) ** 2).mean(dim=-1)  # (N, L): mean loss per patch
        if self.masked:
            # Average only over the masked patches.
            return (per_patch * mask).sum() / mask.sum()
        return per_patch.mean()
diff --git a/croco/models/croco.py b/croco/models/croco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0145e353831ac29e5889a7ec9a70191c71a0028d
--- /dev/null
+++ b/croco/models/croco.py
@@ -0,0 +1,249 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# CroCo model during pretraining
+# --------------------------------------------------------
+
+
+
+import torch
+import torch.nn as nn
+torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
+from functools import partial
+
+from croco.models.blocks import Block, DecoderBlock, PatchEmbed
+from croco.models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
+from croco.models.masking import RandomMask
+
+
class CroCoNet(nn.Module):
    """CroCo pretraining network.

    A shared ViT encoder processes a masked first image and an unmasked second
    image; a cross-attending decoder then reconstructs the masked patches of
    the first image. forward() returns (prediction, mask, patchified target).
    """

    def __init__(self,
                 img_size=224,              # input image size
                 patch_size=16,             # patch_size
                 mask_ratio=0.9,            # ratios of masked tokens
                 enc_embed_dim=768,         # encoder feature dimension
                 enc_depth=12,              # encoder depth
                 enc_num_heads=12,          # encoder number of heads in the transformer block
                 dec_embed_dim=512,         # decoder feature dimension
                 dec_depth=8,               # decoder depth
                 dec_num_heads=16,          # decoder number of heads in the transformer block
                 mlp_ratio=4,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_im2_in_dec=True,      # whether to apply normalization of the 'memory' = (second image) in the decoder
                 pos_embed='cosine',        # positional embedding (either cosine or RoPE100)
                ):

        super(CroCoNet, self).__init__()

        # patch embeddings  (with initialization done as in MAE)
        self._set_patch_embed(img_size, patch_size, enc_embed_dim)

        # mask generations
        self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)

        self.pos_embed = pos_embed
        if pos_embed=='cosine':
            # positional embedding of the encoder
            enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
            # positional embedding of the decoder
            dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
            self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
            # pos embedding in each block
            self.rope = None # nothing for cosine
        elif pos_embed.startswith('RoPE'): # eg RoPE100
            self.enc_pos_embed = None # nothing to add in the encoder with RoPE
            self.dec_pos_embed = None # nothing to add in the decoder with RoPE
            if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
            # The numeric suffix of the pos_embed string is the RoPE base frequency.
            freq = float(pos_embed[len('RoPE'):])
            self.rope = RoPE2D(freq=freq)
        else:
            raise NotImplementedError('Unknown pos_embed '+pos_embed)

        # transformer for the encoder
        self.enc_depth = enc_depth
        self.enc_embed_dim = enc_embed_dim
        self.enc_blocks = nn.ModuleList([
            Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
            for i in range(enc_depth)])
        self.enc_norm = norm_layer(enc_embed_dim)

        # masked tokens
        self._set_mask_token(dec_embed_dim)

        # decoder
        self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)

        # prediction head
        self._set_prediction_head(dec_embed_dim, patch_size)

        # initializer weights
        self.initialize_weights()

    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
        # Setter kept separate so downstream subclasses can override it.
        self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)

    def _set_mask_generator(self, num_patches, mask_ratio):
        # Setter kept separate so downstream subclasses can disable masking.
        self.mask_generator = RandomMask(num_patches, mask_ratio)

    def _set_mask_token(self, dec_embed_dim):
        # Learnable token substituted for every masked patch in the decoder input.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))

    def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
        """Build the decoder: a linear enc->dec projection plus DecoderBlocks."""
        self.dec_depth = dec_depth
        self.dec_embed_dim = dec_embed_dim
        # transfer from encoder to decoder
        self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
        # transformer for the decoder
        self.dec_blocks = nn.ModuleList([
            DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
            for i in range(dec_depth)])
        # final norm layer
        self.dec_norm = norm_layer(dec_embed_dim)

    def _set_prediction_head(self, dec_embed_dim, patch_size):
        # Maps each decoder token to the RGB values of its patch (patch_size**2 * 3).
        self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)


    def initialize_weights(self):
        """Initialize all weights: patch embed (MAE-style), mask token, linears/norms."""
        # patch embed
        self.patch_embed._init_weights()
        # mask tokens
        if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
        # linears and layer norms
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module init applied by self.apply(): xavier for Linear, unit LayerNorm."""
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _encode_image(self, image, do_mask=False, return_all_blocks=False):
        """
        image has B x 3 x img_size x img_size
        do_mask: whether to perform masking or not
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)

        Returns (features, pos, masks): pos covers ALL patches even when masking,
        while features only contain the visible ones.
        """
        # embed the image into patches (x has size B x Npatches x C)
        # and get position if each return patch (pos has size B x Npatches x 2)
        x, pos = self.patch_embed(image)
        # add positional embedding without cls token
        if self.enc_pos_embed is not None:
            x = x + self.enc_pos_embed[None,...]
        # apply masking
        B,N,C = x.size()
        if do_mask:
            masks = self.mask_generator(x)
            # Keep only the visible tokens; the mask drops the same count per sample.
            x = x[~masks].view(B, -1, C)
            posvis = pos[~masks].view(B, -1, 2)
        else:
            B,N,C = x.size()
            # NOTE(review): this mask is created on the default (CPU) device
            # regardless of x.device — confirm downstream boolean indexing copes.
            masks = torch.zeros((B,N), dtype=bool)
            posvis = pos
        # now apply the transformer encoder and normalization
        if return_all_blocks:
            out = []
            for blk in self.enc_blocks:
                x = blk(x, posvis)
                out.append(x)
            # Only the final block's output is normalized, matching the single-output path.
            out[-1] = self.enc_norm(out[-1])
            return out, pos, masks
        else:
            for blk in self.enc_blocks:
                x = blk(x, posvis)
            x = self.enc_norm(x)
            return x, pos, masks

    def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
        """
        return_all_blocks: if True, return the features at the end of every block
                           instead of just the features from the last block (eg for some prediction heads)

        masks1 can be None => assume image1 fully visible
        """
        # encoder to decoder layer
        visf1 = self.decoder_embed(feat1)
        f2 = self.decoder_embed(feat2)
        # append masked tokens to the sequence
        B,Nenc,C = visf1.size()
        if masks1 is None: # downstreams
            f1_ = visf1
        else: # pretraining
            Ntotal = masks1.size(1)
            # Start from mask tokens everywhere, then scatter the visible tokens back in.
            f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
            f1_[~masks1] = visf1.view(B * Nenc, C)
        # add positional embedding
        if self.dec_pos_embed is not None:
            f1_ = f1_ + self.dec_pos_embed
            f2 = f2 + self.dec_pos_embed
        # apply Transformer blocks
        out = f1_
        out2 = f2
        if return_all_blocks:
            _out, out = out, []
            for blk in self.dec_blocks:
                _out, out2 = blk(_out, out2, pos1, pos2)
                out.append(_out)
            out[-1] = self.dec_norm(out[-1])
        else:
            for blk in self.dec_blocks:
                out, out2 = blk(out, out2, pos1, pos2)
            out = self.dec_norm(out)
        return out

    def patchify(self, imgs):
        """
        imgs: (B, 3, H, W)
        x: (B, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        # Only square images whose side is a multiple of the patch size are supported.
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))

        return x

    def unpatchify(self, x, channels=3):
        """
        x: (N, L, patch_size**2 *channels)
        imgs: (N, 3, H, W)
        """
        patch_size = self.patch_embed.patch_size[0]
        # Assumes a square patch grid (L must be a perfect square).
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # NOTE(review): width uses h * patch_size; safe only because h == w above.
        imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
        return imgs

    def forward(self, img1, img2):
        """
        img1: tensor of size B x 3 x img_size x img_size
        img2: tensor of size B x 3 x img_size x img_size

        out will be B x N x (3*patch_size*patch_size)
        masks are also returned as B x N just in case
        """
        # encoder of the masked first image
        feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
        # encoder of the second image
        feat2, pos2, _ = self._encode_image(img2, do_mask=False)
        # decoder
        decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
        # prediction head
        out = self.prediction_head(decfeat)
        # get target
        target = self.patchify(img1)
        return out, mask1, target
diff --git a/croco/models/croco_downstream.py b/croco/models/croco_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..159dfff4d2c1461bc235e21441b57ce1e2088f76
--- /dev/null
+++ b/croco/models/croco_downstream.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# CroCo model for downstream tasks
+# --------------------------------------------------------
+
+import torch
+
+from .croco import CroCoNet
+
+
+def croco_args_from_ckpt(ckpt):
+ if 'croco_kwargs' in ckpt: # CroCo v2 released models
+ return ckpt['croco_kwargs']
+ elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release
+ s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"
+ assert s.startswith('CroCoNet(')
+ return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it
+ else: # CroCo v1 released models
+ return dict()
+
+class CroCoDownstreamMonocularEncoder(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for monocular downstream task, only using the encoder.
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ NOTE: It works by *calling super().__init__() but with redefined setters
+
+ """
+ super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_decoder(self, *args, **kwargs):
+ """ No decoder """
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No 'prediction head' for downstream tasks."""
+ return
+
+ def forward(self, img):
+ """
+ img if of size batch_size x 3 x h x w
+ """
+ B, C, H, W = img.size()
+ img_info = {'height': H, 'width': W}
+ need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers)
+ return self.head(out, img_info)
+
+
+class CroCoDownstreamBinocular(CroCoNet):
+
+ def __init__(self,
+ head,
+ **kwargs):
+ """ Build network for binocular downstream task
+ It takes an extra argument head, that is called with the features
+ and a dictionary img_info containing 'width' and 'height' keys
+ The head is setup with the croconet arguments in this init function
+ """
+ super(CroCoDownstreamBinocular, self).__init__(**kwargs)
+ head.setup(self)
+ self.head = head
+
+ def _set_mask_generator(self, *args, **kwargs):
+ """ No mask generator """
+ return
+
+ def _set_mask_token(self, *args, **kwargs):
+ """ No mask token """
+ self.mask_token = None
+ return
+
+ def _set_prediction_head(self, *args, **kwargs):
+ """ No prediction head for downstream tasks, define your own head """
+ return
+
+ def encode_image_pairs(self, img1, img2, return_all_blocks=False):
+ """ run encoder for a pair of images
+ it is actually ~5% faster to concatenate the images along the batch dimension
+ than to encode them separately
+ """
+ ## the two commented lines below is the naive version with separate encoding
+ #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks)
+ #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False)
+ ## and now the faster version
+ out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks )
+ if return_all_blocks:
+ out,out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out])))
+ out2 = out2[-1]
+ else:
+ out,out2 = out.chunk(2, dim=0)
+ pos,pos2 = pos.chunk(2, dim=0)
+ return out, out2, pos, pos2
+
+ def forward(self, img1, img2):
+ B, C, H, W = img1.size()
+ img_info = {'height': H, 'width': W}
+ return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks
+ out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks)
+ if return_all_blocks:
+ decout = self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ decout = out+decout
+ else:
+ decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks)
+ return self.head(decout, img_info)
\ No newline at end of file
diff --git a/croco/models/curope/__init__.py b/croco/models/curope/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878
--- /dev/null
+++ b/croco/models/curope/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from .curope2d import cuRoPE2D
diff --git a/croco/models/curope/__pycache__/__init__.cpython-310.pyc b/croco/models/curope/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae664f61817c08229de3b3bd25d09e67c512fb68
Binary files /dev/null and b/croco/models/curope/__pycache__/__init__.cpython-310.pyc differ
diff --git a/croco/models/curope/__pycache__/__init__.cpython-38.pyc b/croco/models/curope/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c0e5621f0af15475b19c83026bdab80bf4055e
Binary files /dev/null and b/croco/models/curope/__pycache__/__init__.cpython-38.pyc differ
diff --git a/croco/models/curope/__pycache__/curope2d.cpython-310.pyc b/croco/models/curope/__pycache__/curope2d.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e4c5bd8907288ae181f415c682f7f57f430674
Binary files /dev/null and b/croco/models/curope/__pycache__/curope2d.cpython-310.pyc differ
diff --git a/croco/models/curope/__pycache__/curope2d.cpython-38.pyc b/croco/models/curope/__pycache__/curope2d.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6d3788feadd6ad7a5c123f5ddb9191fee182dcb
Binary files /dev/null and b/croco/models/curope/__pycache__/curope2d.cpython-38.pyc differ
diff --git a/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so b/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..54cb8272bdaeecf27eb33e7b36377339e8a4ba3b
Binary files /dev/null and b/croco/models/curope/build/lib.linux-x86_64-cpython-311/curope.cpython-311-x86_64-linux-gnu.so differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps
new file mode 100644
index 0000000000000000000000000000000000000000..6898f9369bb638a32223f5ed309410a5c55bcb77
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log
new file mode 100644
index 0000000000000000000000000000000000000000..dded40ff4052fee4defdd363fa72572da12ec117
--- /dev/null
+++ b/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_log
@@ -0,0 +1,4 @@
+# ninja log v5
+0 18414 1711080902080587065 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o 1ee4a1dd32c06eb6
+3 17217 1711081567047778386 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o 1ee4a1dd32c06eb6
+3 234838 1711081784655296109 /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o d5d1e582a9379a6b
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja b/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja
new file mode 100644
index 0000000000000000000000000000000000000000..75f901ccb7b6145f0b3d0c5a56a9e4117b1afb9b
--- /dev/null
+++ b/croco/models/curope/build/temp.linux-x86_64-cpython-311/build.ninja
@@ -0,0 +1,33 @@
+ninja_required_version = 1.3
+cxx = c++
+nvcc = /usr/local/cuda/bin/nvcc
+
+cflags = -pthread -B /home/hy/anaconda3/envs/dust3r/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/hy/anaconda3/envs/dust3r/include -fPIC -O2 -isystem /home/hy/anaconda3/envs/dust3r/include -fPIC -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/TH -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/hy/anaconda3/envs/dust3r/include/python3.11 -c
+post_cflags = -O3 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=curope -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
+cuda_cflags = -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/TH -I/home/hy/anaconda3/envs/dust3r/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/hy/anaconda3/envs/dust3r/include/python3.11 -c
+cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O3 --ptxas-options=-v --use_fast_math -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=curope -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
+cuda_dlink_post_cflags =
+ldflags =
+
+rule compile
+ command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
+ depfile = $out.d
+ deps = gcc
+
+rule cuda_compile
+ depfile = $out.d
+ deps = gcc
+ command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags
+
+
+
+
+
+build /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o: compile /data/hunterj-projects/dust3r/croco/models/curope/curope.cpp
+build /data/hunterj-projects/dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o: cuda_compile /data/hunterj-projects/dust3r/croco/models/curope/kernels.cu
+
+
+
+
+
+
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o b/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o
new file mode 100644
index 0000000000000000000000000000000000000000..25a1503c7698cb5202cb176904e3c3a62a28687f
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o differ
diff --git a/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o b/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o
new file mode 100644
index 0000000000000000000000000000000000000000..532e378f66f053ac04f6842fed02a063260f0cba
Binary files /dev/null and b/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o differ
diff --git a/croco/models/curope/curope.cpp b/croco/models/curope/curope.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b
--- /dev/null
+++ b/croco/models/curope/curope.cpp
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+
+// forward declaration
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
+
+void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
+{
+ const int B = tokens.size(0);
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3) / 4;
+
+ auto tok = tokens.accessor<float, 4>();
+ auto pos = positions.accessor<int64_t, 3>();
+
+ for (int b = 0; b < B; b++) {
+ for (int x = 0; x < 2; x++) { // y and then x (2d)
+ for (int n = 0; n < N; n++) {
+
+ // grab the token position
+ const int p = pos[b][n][x];
+
+ for (int h = 0; h < H; h++) {
+ for (int d = 0; d < D; d++) {
+ // grab the two values
+ float u = tok[b][n][h][d+0+x*2*D];
+ float v = tok[b][n][h][d+D+x*2*D];
+
+ // grab the cos,sin
+ const float inv_freq = fwd * p / powf(base, d/float(D));
+ float c = cosf(inv_freq);
+ float s = sinf(inv_freq);
+
+ // write the result
+ tok[b][n][h][d+0+x*2*D] = u*c - v*s;
+ tok[b][n][h][d+D+x*2*D] = v*c + u*s;
+ }
+ }
+ }
+ }
+ }
+}
+
+void rope_2d( torch::Tensor tokens, // B,N,H,D
+ const torch::Tensor positions, // B,N,2
+ const float base,
+ const float fwd )
+{
+ TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+ TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+ TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+ TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+ TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
+ TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
+
+ if (tokens.is_cuda())
+ rope_2d_cuda( tokens, positions, base, fwd );
+ else
+ rope_2d_cpu( tokens, positions, base, fwd );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
+}
diff --git a/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so b/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..54cb8272bdaeecf27eb33e7b36377339e8a4ba3b
Binary files /dev/null and b/croco/models/curope/curope.cpython-311-x86_64-linux-gnu.so differ
diff --git a/croco/models/curope/curope2d.py b/croco/models/curope/curope2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49c12f8c529e9a889b5ac20c5767158f238e17d
--- /dev/null
+++ b/croco/models/curope/curope2d.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+
+try:
+ import curope as _kernels # run `python setup.py install`
+except ModuleNotFoundError:
+ from . import curope as _kernels # run `python setup.py build_ext --inplace`
+
+
+class cuRoPE2D_func (torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, tokens, positions, base, F0=1):
+ ctx.save_for_backward(positions)
+ ctx.saved_base = base
+ ctx.saved_F0 = F0
+ # tokens = tokens.clone() # uncomment this if inplace doesn't work
+ _kernels.rope_2d( tokens, positions, base, F0 )
+ ctx.mark_dirty(tokens)
+ return tokens
+
+ @staticmethod
+ def backward(ctx, grad_res):
+ positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+ _kernels.rope_2d( grad_res, positions, base, -F0 )
+ ctx.mark_dirty(grad_res)
+ return grad_res, None, None, None
+
+
+class cuRoPE2D(torch.nn.Module):
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+
+ def forward(self, tokens, positions):
+ cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
+ return tokens
\ No newline at end of file
diff --git a/croco/models/curope/kernels.cu b/croco/models/curope/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7156cd1bb935cb1f0be45e58add53f9c21505c20
--- /dev/null
+++ b/croco/models/curope/kernels.cu
@@ -0,0 +1,108 @@
+/*
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+ TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+ TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template < typename scalar_t >
+__global__ void rope_2d_cuda_kernel(
+ //scalar_t* __restrict__ tokens,
+ torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
+ const int64_t* __restrict__ pos,
+ const float base,
+ const float fwd )
+ // const int N, const int H, const int D )
+{
+ // tokens shape = (B, N, H, D)
+ const int N = tokens.size(1);
+ const int H = tokens.size(2);
+ const int D = tokens.size(3);
+
+ // each block update a single token, for all heads
+ // each thread takes care of a single output
+ extern __shared__ float shared[];
+ float* shared_inv_freq = shared + D;
+
+ const int b = blockIdx.x / N;
+ const int n = blockIdx.x % N;
+
+ const int Q = D / 4;
+ // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
+ // u_Y v_Y u_X v_X
+
+ // shared memory: first, compute inv_freq
+ if (threadIdx.x < Q)
+ shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+ __syncthreads();
+
+ // start of X or Y part
+ const int X = threadIdx.x < D/2 ? 0 : 1;
+ const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
+
+ // grab the cos,sin appropriate for me
+ const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
+ const float cos = cosf(freq);
+ const float sin = sinf(freq);
+ /*
+ float* shared_cos_sin = shared + D + D/4;
+ if ((threadIdx.x % (D/2)) < Q)
+ shared_cos_sin[m+0] = cosf(freq);
+ else
+ shared_cos_sin[m+Q] = sinf(freq);
+ __syncthreads();
+ const float cos = shared_cos_sin[m+0];
+ const float sin = shared_cos_sin[m+Q];
+ */
+
+ for (int h = 0; h < H; h++)
+ {
+ // then, load all the token for this head in shared memory
+ shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+ __syncthreads();
+
+ const float u = shared[m];
+ const float v = shared[m+Q];
+
+ // write output
+ if ((threadIdx.x % (D/2)) < Q)
+ tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+ else
+ tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+ }
+}
+
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
+{
+ const int B = tokens.size(0); // batch size
+ const int N = tokens.size(1); // sequence length
+ const int H = tokens.size(2); // number of heads
+ const int D = tokens.size(3); // dimension per head
+
+ TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+ TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+ TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
+ TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
+
+ // one block for each layer, one thread per local-max
+ const int THREADS_PER_BLOCK = D;
+ const int N_BLOCKS = B * N; // each block takes care of H*D values
+ const int SHARED_MEM = sizeof(float) * (D + D/4);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
+ rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
+ //tokens.data_ptr<scalar_t>(),
+ tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
+ pos.data_ptr<int64_t>(),
+ base, fwd); //, N, H, D );
+ }));
+}
diff --git a/croco/models/curope/setup.py b/croco/models/curope/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..230632ed05e309200e8f93a3a852072333975009
--- /dev/null
+++ b/croco/models/curope/setup.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
+# alternatively, you can list cuda archs that you want, eg:
+# all_cuda_archs = [
+ # '-gencode', 'arch=compute_70,code=sm_70',
+ # '-gencode', 'arch=compute_75,code=sm_75',
+ # '-gencode', 'arch=compute_80,code=sm_80',
+ # '-gencode', 'arch=compute_86,code=sm_86'
+# ]
+
+setup(
+ name = 'curope',
+ ext_modules = [
+ CUDAExtension(
+ name='curope',
+ sources=[
+ "curope.cpp",
+ "kernels.cu",
+ ],
+ extra_compile_args = dict(
+ nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
+ cxx=['-O3'])
+ )
+ ],
+ cmdclass = {
+ 'build_ext': BuildExtension
+ })
diff --git a/croco/models/dpt_block.py b/croco/models/dpt_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..33a3b63c6267270883c2e620ca226059d63dc8df
--- /dev/null
+++ b/croco/models/dpt_block.py
@@ -0,0 +1,450 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# DPT head for ViTs
+# --------------------------------------------------------
+# References:
+# https://github.com/isl-org/DPT
+# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from typing import Union, Tuple, Iterable, List, Optional, Dict
+
+def pair(t):
+ return t if isinstance(t, tuple) else (t, t)
+
+def make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ out_shape4 = out_shape
+ if expand == True:
+ out_shape1 = out_shape
+ out_shape2 = out_shape * 2
+ out_shape3 = out_shape * 4
+ out_shape4 = out_shape * 8
+
+ scratch.layer1_rn = nn.Conv2d(
+ in_shape[0],
+ out_shape1,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer2_rn = nn.Conv2d(
+ in_shape[1],
+ out_shape2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer3_rn = nn.Conv2d(
+ in_shape[2],
+ out_shape3,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+ scratch.layer4_rn = nn.Conv2d(
+ in_shape[3],
+ out_shape4,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ groups=groups,
+ )
+
+ scratch.layer_rn = nn.ModuleList([
+ scratch.layer1_rn,
+ scratch.layer2_rn,
+ scratch.layer3_rn,
+ scratch.layer4_rn,
+ ])
+
+ return scratch
+
+class ResidualConvUnit_custom(nn.Module):
+ """Residual convolution module."""
+
+ def __init__(self, features, activation, bn):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups = 1
+
+ self.conv1 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ self.conv2 = nn.Conv2d(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not self.bn,
+ groups=self.groups,
+ )
+
+ if self.bn == True:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x): # RefineNet中的RCU组件: Residual Conv Unit
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x) # ReLU
+ out = self.conv1(out)
+ if self.bn == True:
+ out = self.bn1(out)
+
+ out = self.activation(out) # ReLU
+ out = self.conv2(out)
+ if self.bn == True:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x) # 残差连接
+
+class FeatureFusionBlock_custom(nn.Module):
+ """Feature fusion block."""
+
+ def __init__(
+ self,
+ features,
+ activation,
+ deconv=False,
+ bn=False,
+ expand=False,
+ align_corners=True,
+ width_ratio=1,
+ ):
+ """Init.
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock_custom, self).__init__()
+ self.width_ratio = width_ratio
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups = 1
+
+ self.expand = expand
+ out_features = features
+ if self.expand == True:
+ out_features = features // 2
+
+ self.out_conv = nn.Conv2d(
+ features,
+ out_features,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=True,
+ groups=1,
+ )
+
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, *xs): # Multi-Path Refinement
+ """Forward pass.
+ Returns:
+ tensor: output
+ """
+ output = xs[0] # 前面小分辨率图像对应的RefineNet的累计输出结果
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])# 当前分辨率图像对应RefineNet的输入
+ if self.width_ratio != 1: # 不执行
+ res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
+
+ output = self.skip_add.add(output, res) # 残差连接 output += res
+
+
+ output = self.resConfUnit2(output)
+
+ if self.width_ratio != 1:
+ # and output.shape[3] < self.width_ratio * output.shape[2]
+ #size=(image.shape[])
+ if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
+ shape = 3 * output.shape[3]
+ else:
+ shape = int(self.width_ratio * 2 * output.shape[2])
+ output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
+ else:
+ output = nn.functional.interpolate(output, scale_factor=2,
+ mode="bilinear", align_corners=self.align_corners)
+ output = self.out_conv(output)
+ return output
+
+def make_fusion_block(features, use_bn, width_ratio=1):
+ return FeatureFusionBlock_custom(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ width_ratio=width_ratio,
+ )
+
+class Interpolate(nn.Module):
+ """Interpolation module."""
+
+ def __init__(self, scale_factor, mode, align_corners=False):
+ """Init.
+ Args:
+ scale_factor (float): scaling
+ mode (str): interpolation mode
+ """
+ super(Interpolate, self).__init__()
+
+ self.interp = nn.functional.interpolate
+ self.scale_factor = scale_factor
+ self.mode = mode
+ self.align_corners = align_corners
+
+ def forward(self, x):
+ """Forward pass.
+ Args:
+ x (tensor): input
+ Returns:
+ tensor: interpolated data
+ """
+
+ x = self.interp(
+ x,
+ scale_factor=self.scale_factor,
+ mode=self.mode,
+ align_corners=self.align_corners,
+ )
+
+ return x
+
+class DPTOutputAdapter(nn.Module):
+ """DPT output adapter.
+
+ :param num_channels: Number of output channels
+ :param stride_level: Stride level compared to the full-sized image.
+ E.g. 4 for 1/4th the size of the image.
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
+ Patch size for smaller inputs will be computed accordingly.
+ :param hooks: Index of intermediate layers
+ :param layer_dims: Dimension of intermediate layers
+ :param feature_dim: Feature dimension
+ :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
+ :param use_bn: If set to True, activates batch norm
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+
+ def __init__(self,
+ num_channels: int = 1,
+ stride_level: int = 1,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ main_tasks: Iterable[str] = ('rgb',),
+ hooks: List[int] = [2, 5, 8, 11],
+ layer_dims: List[int] = [96, 192, 384, 768],
+ feature_dim: int = 256,
+ last_dim: int = 32,
+ use_bn: bool = False,
+ dim_tokens_enc: Optional[int] = None,
+ head_type: str = 'regression',
+ output_width_ratio=1,
+ **kwargs):
+ super().__init__()
+ self.num_channels = num_channels
+ self.stride_level = stride_level
+ self.patch_size = pair(patch_size)
+ self.main_tasks = main_tasks
+ self.hooks = hooks
+ self.layer_dims = layer_dims
+ self.feature_dim = feature_dim
+ self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
+ self.head_type = head_type
+
+ # Actual patch height and width, taking into account stride of input
+ self.P_H = max(1, self.patch_size[0] // stride_level)
+ self.P_W = max(1, self.patch_size[1] // stride_level)
+
+ self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
+
+ self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+ self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
+
+ if self.head_type == 'regression':
+ # The "DPTDepthModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
+ )
+ elif self.head_type == 'semseg':
+ # The "DPTSegmentationModel" head
+ self.head = nn.Sequential(
+ nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
+ nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
+ nn.ReLU(True),
+ nn.Dropout(0.1, False),
+ nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+ )
+ else:
+ raise ValueError('DPT head_type must be "regression" or "semseg".')
+
+ if self.dim_tokens_enc is not None:
+ self.init(dim_tokens_enc=dim_tokens_enc)
+
+ def init(self, dim_tokens_enc=768):
+ """
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
+ Should be called when setting up MultiMAE.
+
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
+ """
+ #print(dim_tokens_enc)
+
+ # Set up activation postprocessing layers
+ if isinstance(dim_tokens_enc, int):
+ dim_tokens_enc = 4 * [dim_tokens_enc]
+
+ self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
+
+ self.act_1_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[0],
+ out_channels=self.layer_dims[0],
+ kernel_size=4, stride=4, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_2_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.ConvTranspose2d(
+ in_channels=self.layer_dims[1],
+ out_channels=self.layer_dims[1],
+ kernel_size=2, stride=2, padding=0,
+ bias=True, dilation=1, groups=1,
+ )
+ )
+
+ self.act_3_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[2],
+ out_channels=self.layer_dims[2],
+ kernel_size=1, stride=1, padding=0,
+ )
+ )
+
+ self.act_4_postprocess = nn.Sequential(
+ nn.Conv2d(
+ in_channels=self.dim_tokens_enc[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=1, stride=1, padding=0,
+ ),
+ nn.Conv2d(
+ in_channels=self.layer_dims[3],
+ out_channels=self.layer_dims[3],
+ kernel_size=3, stride=2, padding=1,
+ )
+ )
+
+ self.act_postprocess = nn.ModuleList([
+ self.act_1_postprocess,
+ self.act_2_postprocess,
+ self.act_3_postprocess,
+ self.act_4_postprocess
+ ])
+
+ def adapt_tokens(self, encoder_tokens):
+ # Adapt tokens
+ x = []
+ x.append(encoder_tokens[:, :])
+ x = torch.cat(x, dim=-1)
+ return x
+
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size):
+ #input_info: Dict):
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
+ H, W = image_size
+
+ # Number of patches in height and width
+ N_H = H // (self.stride_level * self.P_H)
+ N_W = W // (self.stride_level * self.P_W)
+
+ # Hook decoder onto 4 layers from specified ViT layers
+ layers = [encoder_tokens[hook] for hook in self.hooks]
+
+ # Extract only task-relevant tokens and ignore global tokens.
+ layers = [self.adapt_tokens(l) for l in layers]
+
+ # Reshape tokens to spatial representation
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
+
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
+ # Project layers to chosen feature dim
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
+
+ # Fuse layers using refinement stages
+ path_4 = self.scratch.refinenet4(layers[3])
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
+
+ # Output head
+ out = self.head(path_1)
+
+ return out
diff --git a/croco/models/head_downstream.py b/croco/models/head_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd40c91ba244d6c3522c6efd4ed4d724b7bdc650
--- /dev/null
+++ b/croco/models/head_downstream.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Heads for downstream tasks
+# --------------------------------------------------------
+
+"""
+A head is a module where the __init__ defines only the head hyperparameters.
+A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes.
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height'
+"""
+
+import torch
+import torch.nn as nn
+from .dpt_block import DPTOutputAdapter
+
+
+class PixelwiseTaskWithDPT(nn.Module):
+ """ DPT module for CroCo.
+ by default, hooks_idx will be equal to:
+ * for encoder-only: 4 equally spread layers
+ * for encoder+decoder: last encoder + 3 equally spread layers of the decoder
+ """
+
+ def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768],
+ output_width_ratio=1, num_channels=1, postprocess=None, **kwargs):
+ super(PixelwiseTaskWithDPT, self).__init__()
+ self.return_all_blocks = True # backbone needs to return all layers
+ self.postprocess = postprocess
+ self.output_width_ratio = output_width_ratio
+ self.num_channels = num_channels
+ self.hooks_idx = hooks_idx
+ self.layer_dims = layer_dims
+
+ def setup(self, croconet):
+ dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels}
+ if self.hooks_idx is None:
+ if hasattr(croconet, 'dec_blocks'): # encoder + decoder
+ step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth]
+ hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ else: # encoder only
+ step = croconet.enc_depth//4
+ hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)]
+ self.hooks_idx = hooks_idx
+ print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}')
+ dpt_args['hooks'] = self.hooks_idx
+ dpt_args['layer_dims'] = self.layer_dims
+ self.dpt = DPTOutputAdapter(**dpt_args)
+        dim_tokens = [croconet.enc_embed_dim if hook<croconet.enc_depth else croconet.dec_embed_dim for hook in self.hooks_idx]
+        self.dpt.init(dim_tokens_enc=dim_tokens)
+
+    def forward(self, x, img_info):
+        out = self.dpt(x, image_size=(img_info['height'], img_info['width']))
+        if self.postprocess: out = self.postprocess(out)
+        return out
diff --git a/croco/models/pos_embed.py b/croco/models/pos_embed.py
new file mode 100644
--- /dev/null
+++ b/croco/models/pos_embed.py
@@ -0,0 +1,155 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Position embedding utils
+# --------------------------------------------------------
+
+import numpy as np
+
+import torch
+
+# --------------------------------------------------------
+# 2D sine-cosine position embedding
+# References:
+# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
+# MoCo v3: https://github.com/facebookresearch/moco-v3
+# --------------------------------------------------------
+def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if n_cls_token>0:
+        pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ assert embed_dim % 2 == 0
+
+ # use half of dimensions to encode grid_h
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded: size (M,)
+ out: (M, D)
+ """
+ assert embed_dim % 2 == 0
+ omega = np.arange(embed_dim // 2, dtype=float)
+ omega /= embed_dim / 2.
+ omega = 1. / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):
+ if 'pos_embed' in checkpoint_model:
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
+ embedding_size = pos_embed_checkpoint.shape[-1]
+ num_patches = model.patch_embed.num_patches
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+ # height (== width) for the checkpoint position embedding
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+ # height (== width) for the new position embedding
+ new_size = int(num_patches ** 0.5)
+ # class_token and dist_token are kept unchanged
+ if orig_size != new_size:
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+ # only the position tokens are interpolated
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+ pos_tokens = torch.nn.functional.interpolate(
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+ checkpoint_model['pos_embed'] = new_pos_embed
+
+
+#----------------------------------------------------------
+# RoPE2D: RoPE implementation in 2D
+#----------------------------------------------------------
+
+try:
+ from models.curope import cuRoPE2D
+ RoPE2D = cuRoPE2D
+except ImportError:
+ print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
+
+ class RoPE2D(torch.nn.Module):
+
+ def __init__(self, freq=100.0, F0=1.0):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype):
+ if (D,seq_len,device,dtype) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
+ return self.cache[D,seq_len,device,dtype]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim==2
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
+ output:
+            * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
+ D = tokens.size(3) // 2
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
+ # split features into two along the feature dimension, and apply rope1d on each half
+ y, x = tokens.chunk(2, dim=-1)
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
+ tokens = torch.cat((y, x), dim=-1)
+ return tokens
\ No newline at end of file
diff --git a/croco/pretrain.py b/croco/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c45e488015ef5380c71d0381ff453fdb860759e
--- /dev/null
+++ b/croco/pretrain.py
@@ -0,0 +1,254 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Pre-training CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+import math
+from pathlib import Path
+from typing import Iterable
+
+import torch
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torch.utils.tensorboard import SummaryWriter
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+import utils.misc as misc
+from utils.misc import NativeScalerWithGradNormCount as NativeScaler
+from models.croco import CroCoNet
+from models.criterion import MaskedMSE
+from datasets.pairs_dataset import PairsDataset
+
+
+def get_args_parser():
+ parser = argparse.ArgumentParser('CroCo pre-training', add_help=False)
+ # model and criterion
+ parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build")
+ parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss")
+ # dataset
+ parser.add_argument('--dataset', default='habitat_release', type=str, help="training set")
+ parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful
+ # training
+ parser.add_argument('--seed', default=0, type=int, help="Random seed")
+ parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus")
+ parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler")
+ parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch")
+ parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)")
+ parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)")
+ parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)')
+ parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
+ parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0')
+ parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR')
+ parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining")
+ # others
+ parser.add_argument('--num_workers', default=8, type=int)
+ parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
+ parser.add_argument('--local_rank', default=-1, type=int)
+ parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
+ parser.add_argument('--save_freq', default=1, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth')
+ parser.add_argument('--keep_freq', default=20, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth')
+ parser.add_argument('--print_freq', default=20, type=int, help='frequence (number of iterations) to print infos while training')
+ # paths
+ parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output")
+ parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
+ return parser
+
+
+
+
+def main(args):
+ misc.init_distributed_mode(args)
+ global_rank = misc.get_rank()
+ world_size = misc.get_world_size()
+
+ print("output_dir: "+args.output_dir)
+ if args.output_dir:
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+ # auto resume
+ last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth')
+ args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+ print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
+ print("{}".format(args).replace(', ', ',\n'))
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ device = torch.device(device)
+
+ # fix the seed
+ seed = args.seed + misc.get_rank()
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+ cudnn.benchmark = True
+
+ ## training dataset and loader
+ print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms))
+ dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir)
+ if world_size>1:
+ sampler_train = torch.utils.data.DistributedSampler(
+ dataset, num_replicas=world_size, rank=global_rank, shuffle=True
+ )
+ print("Sampler_train = %s" % str(sampler_train))
+ else:
+ sampler_train = torch.utils.data.RandomSampler(dataset)
+ data_loader_train = torch.utils.data.DataLoader(
+ dataset, sampler=sampler_train,
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=True,
+ )
+
+ ## model
+ print('Loading model: {:s}'.format(args.model))
+ model = eval(args.model)
+ print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss))))
+ criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss))
+
+ model.to(device)
+ model_without_ddp = model
+ print("Model = %s" % str(model_without_ddp))
+
+ eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
+ if args.lr is None: # only base_lr is specified
+ args.lr = args.blr * eff_batch_size / 256
+ print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
+ print("actual lr: %.2e" % args.lr)
+ print("accumulate grad iterations: %d" % args.accum_iter)
+ print("effective batch size: %d" % eff_batch_size)
+
+ if args.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True)
+ model_without_ddp = model.module
+
+ param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers
+ optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+ print(optimizer)
+ loss_scaler = NativeScaler()
+
+ misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
+
+ if global_rank == 0 and args.output_dir is not None:
+ log_writer = SummaryWriter(log_dir=args.output_dir)
+ else:
+ log_writer = None
+
+ print(f"Start training until {args.max_epoch} epochs")
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.max_epoch):
+ if world_size>1:
+ data_loader_train.sampler.set_epoch(epoch)
+
+ train_stats = train_one_epoch(
+ model, criterion, data_loader_train,
+ optimizer, device, epoch, loss_scaler,
+ log_writer=log_writer,
+ args=args
+ )
+
+ if args.output_dir and epoch % args.save_freq == 0 :
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch, fname='last')
+
+ if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1):
+ misc.save_model(
+ args=args, model_without_ddp=model_without_ddp, optimizer=optimizer,
+ loss_scaler=loss_scaler, epoch=epoch)
+
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+ 'epoch': epoch,}
+
+ if args.output_dir and misc.is_main_process():
+ if log_writer is not None:
+ log_writer.flush()
+ with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
+ f.write(json.dumps(log_stats) + "\n")
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+
+
+
+def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
+ data_loader: Iterable, optimizer: torch.optim.Optimizer,
+ device: torch.device, epoch: int, loss_scaler,
+ log_writer=None,
+ args=None):
+ model.train(True)
+ metric_logger = misc.MetricLogger(delimiter=" ")
+ metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+ accum_iter = args.accum_iter
+
+ optimizer.zero_grad()
+
+ if log_writer is not None:
+ print('log_dir: {}'.format(log_writer.log_dir))
+
+ for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
+
+ # we use a per iteration lr scheduler
+ if data_iter_step % accum_iter == 0:
+ misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)
+
+ image1 = image1.to(device, non_blocking=True)
+ image2 = image2.to(device, non_blocking=True)
+ with torch.cuda.amp.autocast(enabled=bool(args.amp)):
+ out, mask, target = model(image1, image2)
+ loss = criterion(out, mask, target)
+
+ loss_value = loss.item()
+
+ if not math.isfinite(loss_value):
+ print("Loss is {}, stopping training".format(loss_value))
+ sys.exit(1)
+
+ loss /= accum_iter
+ loss_scaler(loss, optimizer, parameters=model.parameters(),
+ update_grad=(data_iter_step + 1) % accum_iter == 0)
+ if (data_iter_step + 1) % accum_iter == 0:
+ optimizer.zero_grad()
+
+ torch.cuda.synchronize()
+
+ metric_logger.update(loss=loss_value)
+
+ lr = optimizer.param_groups[0]["lr"]
+ metric_logger.update(lr=lr)
+
+ loss_value_reduce = misc.all_reduce_mean(loss_value)
+ if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0:
+ # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes
+ epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
+ log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
+ log_writer.add_scalar('lr', lr, epoch_1000x)
+
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ print("Averaged stats:", metric_logger)
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+
+if __name__ == '__main__':
+ args = get_args_parser()
+ args = args.parse_args()
+ main(args)
diff --git a/croco/stereoflow/README.MD b/croco/stereoflow/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34
--- /dev/null
+++ b/croco/stereoflow/README.MD
@@ -0,0 +1,318 @@
+## CroCo-Stereo and CroCo-Flow
+
+This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained.
+All commands should be launched from the root directory.
+
+### Simple inference example
+
+We provide a simple inference example for CroCo-Stereo and CroCo-Flow in the notebook `croco-stereo-flow-demo.ipynb`.
+Before running it, please download the trained models with:
+```
+bash stereoflow/download_model.sh crocostereo.pth
+bash stereoflow/download_model.sh crocoflow.pth
+```
+
+### Prepare data for training or evaluation
+
+Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`).
+Please find below how the file structure should look for each dataset:
+
+FlyingChairs
+
+```
+./data/stereoflow/FlyingChairs/
+└───chairs_split.txt
+└───data/
+ └─── ...
+```
+
+
+
+MPI-Sintel
+
+```
+./data/stereoflow/MPI-Sintel/
+└───training/
+│ └───clean/
+│ └───final/
+│ └───flow/
+└───test/
+ └───clean/
+ └───final/
+```
+
+
+
+SceneFlow (including FlyingThings)
+
+```
+./data/stereoflow/SceneFlow/
+└───Driving/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+└───FlyingThings/
+│ └───disparity/
+│ └───frames_cleanpass/
+│ └───frames_finalpass/
+│ └───optical_flow/
+└───Monkaa/
+ └───disparity/
+ └───frames_cleanpass/
+ └───frames_finalpass/
+```
+
+
+
+TartanAir
+
+```
+./data/stereoflow/TartanAir/
+└───abandonedfactory/
+│ └───.../
+└───abandonedfactory_night/
+│ └───.../
+└───.../
+```
+
+
+
+Booster
+
+```
+./data/stereoflow/booster_gt/
+└───train/
+ └───balanced/
+ └───Bathroom/
+ └───Bedroom/
+ └───...
+```
+
+
+
+CREStereo
+
+```
+./data/stereoflow/crenet_stereo_trainset/
+└───stereo_trainset/
+ └───crestereo/
+ └───hole/
+ └───reflective/
+ └───shapenet/
+ └───tree/
+```
+
+
+
+ETH3D Two-view Low-res
+
+```
+./data/stereoflow/eth3d_lowres/
+└───test/
+│ └───lakeside_1l/
+│ └───...
+└───train/
+│ └───delivery_area_1l/
+│ └───...
+└───train_gt/
+ └───delivery_area_1l/
+ └───...
+```
+
+
+
+KITTI 2012
+
+```
+./data/stereoflow/kitti-stereo-2012/
+└───testing/
+│ └───colored_0/
+│ └───colored_1/
+└───training/
+ └───colored_0/
+ └───colored_1/
+ └───disp_occ/
+ └───flow_occ/
+```
+
+
+
+KITTI 2015
+
+```
+./data/stereoflow/kitti-stereo-2015/
+└───testing/
+│ └───image_2/
+│ └───image_3/
+└───training/
+ └───image_2/
+ └───image_3/
+ └───disp_occ_0/
+ └───flow_occ/
+```
+
+
+
+Middlebury
+
+```
+./data/stereoflow/middlebury
+└───2005/
+│ └───train/
+│ └───Art/
+│ └───...
+└───2006/
+│ └───Aloe/
+│ └───Baby1/
+│ └───...
+└───2014/
+│ └───Adirondack-imperfect/
+│ └───Adirondack-perfect/
+│ └───...
+└───2021/
+│ └───data/
+│ └───artroom1/
+│ └───artroom2/
+│ └───...
+└───MiddEval3_F/
+ └───test/
+ │ └───Australia/
+ │ └───...
+ └───train/
+ └───Adirondack/
+ └───...
+```
+
+
+
+Spring
+
+```
+./data/stereoflow/spring/
+└───test/
+│ └───0003/
+│ └───...
+└───train/
+ └───0001/
+ └───...
+```
+
+
+
+### CroCo-Stereo
+
+##### Main model
+
+The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo.pth
+# Middlebury v3 submission
+python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9
+# Training command that was used, using checkpoint-last.pth
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus:
+torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+```
+
+For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo_subtrain.pth
+# Evaluation on validation sets
+python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9
+# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/
+```
+
+##### Other models
+
+
+ Model for ETH3D
+ The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss.
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_eth3d.pth
+ # ETH3D submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/
+
+
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9
+ # Training that was used
+ python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocostereo_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used
+ python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the `--pretrained` argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use `bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth`, and for the model with a ViT-Base encoder and a Base decoder, use `bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth`.
+
+
+
+### CroCo-Flow
+
+##### Main model
+
+The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets.
+It was used for our submission to the MPI-Sintel benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocoflow.pth
+# Evaluation
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9
+# Sintel submission
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9
+# Training command that was used, with checkpoint-best.pth
+python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/
+```
+
+##### Other models
+
+
+ Main model finetuned on Kitti
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth
+ # Kitti submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99
+ # Training that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/
+
+
+
+ Main model finetuned on Spring
+
+ # Download the model
+ bash stereoflow/download_model.sh crocoflow_finetune_spring.pth
+ # Spring submission
+ python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+ # Training command that was used, with checkpoint-last.pth
+ python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/
+
+
+
+ Smaller models
+ To train CroCo-Flow with smaller CroCo pretrained models, simply replace the `--pretrained` argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use `bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth`, and for the model with a ViT-Base encoder and a Base decoder, use `bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth`.
+
diff --git a/croco/stereoflow/augmentor.py b/croco/stereoflow/augmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e6117151988d94cbc4b385e0d88e982133bf10
--- /dev/null
+++ b/croco/stereoflow/augmentor.py
@@ -0,0 +1,290 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Data augmentation for training stereo and flow
+# --------------------------------------------------------
+
+# References
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py
+
+
+import numpy as np
+import random
+from PIL import Image
+
+import cv2
+cv2.setNumThreads(0)
+cv2.ocl.setUseOpenCL(False)
+
+import torch
+from torchvision.transforms import ColorJitter
+import torchvision.transforms.functional as FF
+
+class StereoAugmentor(object):
+
    def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5):
        """Data augmentor for stereo training pairs (scale, crop, flips, color jitter).

        Args:
            crop_size: (height, width) of the final crop.
            scale_prob: probability of applying random rescaling.
            scale_xonly: if True, rescale along x only (the disparity axis).
            lhth: image-size threshold selecting between the low (`lminscale`/`lmaxscale`)
                and high (`hminscale`/`hmaxscale`) scale ranges — exact use is in
                `_random_scale`, whose body is not fully visible here; TODO confirm.
            scale_interp_nearest: if True, resize the disparity map with nearest-neighbor
                interpolation instead of bilinear.
            rightjitterprob: probability of jittering the right image.
            v_flip_prob: probability of a vertical flip of the whole pair.
            color_aug_asym: if True, color augmentation may differ between the two images.
            color_choice_prob: probability parameter for the color-augmentation choice.
        """
        self.crop_size = crop_size
        self.scale_prob = scale_prob
        self.scale_xonly = scale_xonly
        self.lhth = lhth
        self.lminscale = lminscale
        self.lmaxscale = lmaxscale
        self.hminscale = hminscale
        self.hmaxscale = hmaxscale
        self.scale_interp_nearest = scale_interp_nearest
        self.rightjitterprob = rightjitterprob
        self.v_flip_prob = v_flip_prob
        self.color_aug_asym = color_aug_asym
        self.color_choice_prob = color_choice_prob
+
+ def _random_scale(self, img1, img2, disp):
+ ch,cw = self.crop_size
+ h,w = img1.shape[:2]
+ if self.scale_prob>0. and np.random.rand()1.:
+ scale_x = clip_scale
+ scale_y = scale_x if not self.scale_xonly else 1.0
+ img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+ disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x
+ return img1, img2, disp
+
+ def _random_crop(self, img1, img2, disp):
+ h,w = img1.shape[:2]
+ ch,cw = self.crop_size
+ assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw)
+ offset_x = np.random.randint(w - cw + 1)
+ offset_y = np.random.randint(h - ch + 1)
+ img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw]
+ return img1, img2, disp
+
+ def _random_vflip(self, img1, img2, disp):
+ # vertical flip
+ if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob:
+ img1 = np.copy(np.flipud(img1))
+ img2 = np.copy(np.flipud(img2))
+ disp = np.copy(np.flipud(disp))
+ return img1, img2, disp
+
+ def _random_rotate_shift_right(self, img2):
+ if self.rightjitterprob>0. and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1)
+ xx = xx[v]
+ yy = yy[v]
+ flow1 = flow1[v]
+
+ flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones
+ flow[yy, xx] = flow1
+ return flow
+
    def spatial_transform(self, img1, img2, flow, dname):
        """Randomly rescale, flip and crop an image pair and its optical flow.

        The sequence of np.random draws below defines the augmentation and must not
        be reordered. `dname` is the dataset name; Spring stores ground truth at twice
        the image resolution, hence the factor-2 flow resizing in that case.
        Returns (img1, img2, flow) cropped to ``self.crop_size``.
        """
        if np.random.rand() < self.spatial_aug_prob:
            # randomly sample scale
            ht, wd = img1.shape[:2]
            # never scale below what still allows a crop_size (+8 margin) crop
            clip_min_scale = np.maximum(
                (self.crop_size[0] + 8) / float(ht),
                (self.crop_size[1] + 8) / float(wd))
            min_scale, max_scale = self.min_scale, self.max_scale  # NOTE: unused locals, kept as-is
            scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
            scale_x = scale
            scale_y = scale
            # optionally stretch x and y independently
            if np.random.rand() < self.stretch_prob:
                scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
                scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
            scale_x = np.clip(scale_x, clip_min_scale, None)
            scale_y = np.clip(scale_y, clip_min_scale, None)
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0)
        elif dname=="Spring":
            # no rescaling, but Spring's double-resolution flow must still be halved
            flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0)

        if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip
            img1 = img1[:, ::-1]
            img2 = img2[:, ::-1]
            flow = flow[:, ::-1] * [-1.0, 1.0]  # horizontal flip negates the x component

        if self.v_flip_prob>0. and np.random.rand() < self.v_flip_prob: # v-flip
            img1 = img1[::-1, :]
            img2 = img2[::-1, :]
            flow = flow[::-1, :] * [1.0, -1.0]  # vertical flip negates the y component

        # In case no cropping
        if img1.shape[0] - self.crop_size[0] > 0:
            y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0])
        else:
            y0 = 0
        if img1.shape[1] - self.crop_size[1] > 0:
            x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1])
        else:
            x0 = 0

        img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
        img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
        flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]

        return img1, img2, flow
+
+ def __call__(self, img1, img2, flow, dname):
+ img1, img2, flow = self.spatial_transform(img1, img2, flow, dname)
+ img1, img2 = self.color_transform(img1, img2)
+ img1 = np.ascontiguousarray(img1)
+ img2 = np.ascontiguousarray(img2)
+ flow = np.ascontiguousarray(flow)
+ return img1, img2, flow
\ No newline at end of file
diff --git a/croco/stereoflow/criterion.py b/croco/stereoflow/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..57792ebeeee34827b317a4d32b7445837bb33f17
--- /dev/null
+++ b/croco/stereoflow/criterion.py
@@ -0,0 +1,251 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Losses, metrics per batch, metrics per dataset
+# --------------------------------------------------------
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+def _get_gtnorm(gt):
+ if gt.size(1)==1: # stereo
+ return gt
+ # flow
+ return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW
+
+############ losses without confidence
+
+class L1Loss(nn.Module):
+
    def __init__(self, max_gtnorm=None):
        """Plain L1 loss (no confidence input).

        Args:
            max_gtnorm: if not None, appears to be used in forward to exclude
                pixels whose ground-truth magnitude is too large — the forward
                body is partially garbled in this view; TODO confirm.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = False  # this criterion takes no confidence map
+
+ def _error(self, gt, predictions):
+ return torch.abs(gt-predictions)
+
+ def forward(self, predictions, gt, inspect=False):
+ mask = torch.isfinite(gt)
+ if self.max_gtnorm is not None:
+ mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant
+
+
+class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b
    def __init__(self, max_gtnorm=10000., a=0.25, b=4.):
        """Bounded Laplacian loss with confidence, used for CroCo-Flow.

        In the paper's equation, a = 1/b (per the class comment above).

        Args:
            max_gtnorm: pixels with ground-truth magnitude above this are excluded
                (forward body partially garbled in this view; TODO confirm).
            a, b: bounding parameters of the confidence term.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True  # forward expects a confidence map
        self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b
    def __init__(self, max_gtnorm=None, a=3.0, b=3.0):
        """Bounded Laplacian loss with confidence, used for CroCo-Stereo (except ETH3D).

        In the paper's equation, a = b (per the class comment above).

        Args:
            max_gtnorm: optional exclusion threshold on ground-truth magnitude
                (forward body partially garbled in this view; TODO confirm).
            a, b: bounding parameters of the confidence term.
        """
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True  # forward expects a confidence map
        self.a, self.b = a, b
+
+ def forward(self, predictions, gt, conf):
+ mask = torch.isfinite(gt)
+ mask = mask[:,0,:,:]
+ if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant
+
+############## metrics per batch
+
class StereoMetrics(nn.Module):
    """Batch-level stereo disparity metrics: avgerr, rmse and bad-pixel ratios."""

    def __init__(self, do_quantile=False):
        super().__init__()
        # thresholds (in pixels) for the bad-pixel ratios
        self.bad_ths = [0.5, 1, 2, 3]
        self.do_quantile = do_quantile

    def forward(self, predictions, gt):
        """Compute metrics over a batch; pixels with non-finite gt are ignored."""
        batch = predictions.size(0)
        valid = torch.isfinite(gt)
        # Replace invalid (infinite) gt by a large finite value so the masked
        # products below stay finite (inf * 0 would produce nan).
        gt_fixed = gt.clone()
        gt_fixed[~valid] = 999999.0
        npx = valid.view(batch, -1).sum(dim=1)
        abs_err = (torch.abs(gt_fixed - predictions) * valid).view(batch, -1)
        sq_err = (torch.square(gt_fixed - predictions) * valid).view(batch, -1)
        metrics = {
            'avgerr': torch.mean(abs_err.sum(dim=1) / npx),
            'rmse': torch.sqrt(sq_err.sum(dim=1) / npx).mean(dim=0),
        }
        # fraction of valid pixels with error above each threshold, in percent
        for th in self.bad_ths:
            bad = ((abs_err > th) * valid.view(batch, -1)).sum(dim=1) / npx
            metrics['bad@{:.1f}'.format(th)] = bad.mean(dim=0) * 100
        return metrics
+
class FlowMetrics(nn.Module):
    """Batch-level optical-flow metrics: L1 error, EPE and bad-pixel ratios."""

    def __init__(self):
        super().__init__()
        self.bad_ths = [1, 3, 5]  # EPE thresholds (in pixels)

    def forward(self, predictions, gt):
        """Compute metrics over a batch; pixels with non-finite gt are ignored."""
        batch = predictions.size(0)
        # invalid pixels have infinite gt in both channels, so checking x suffices
        valid = torch.isfinite(gt[:, 0, :, :])
        npx = valid.view(batch, -1).sum(dim=1)
        # put a large finite placeholder at invalid pixels; their error is masked out
        gt_fixed = gt.clone()
        gt_fixed[:, 0, :, :][~valid] = 999999.0
        gt_fixed[:, 1, :, :][~valid] = 999999.0
        diff = gt_fixed - predictions
        l1 = (diff.abs().sum(dim=1) * valid).view(batch, -1)
        epe = (diff.square().sum(dim=1).sqrt() * valid).view(batch, -1)
        metrics = {
            'L1err': torch.mean(l1.sum(dim=1) / npx),
            'EPE': torch.mean(epe.sum(dim=1) / npx),
        }
        # fraction of valid pixels with EPE above each threshold, in percent
        for th in self.bad_ths:
            bad = ((epe > th) * valid.view(batch, -1)).sum(dim=1) / npx
            metrics['bad@{:.1f}'.format(th)] = bad.mean(dim=0) * 100
        return metrics
+
+############## metrics per dataset
+## we update the average and maintain the number of pixels while adding data batch per batch
+## at the beggining, call reset()
+## after each batch, call add_batch(...)
+## at the end: call get_results()
+
class StereoDatasetMetrics(nn.Module):
    """Dataset-level stereo metrics, aggregated incrementally batch by batch.

    Usage: call reset() once, then add_batch(...) after each batch, and
    finally get_results() to obtain the aggregated metrics dict.
    """

    def __init__(self):
        super().__init__()
        self.bad_ths = [0.5, 1, 2, 3]  # bad-pixel thresholds (in pixels)

    def reset(self):
        self.agg_N = 0                              # number of valid pixels so far
        self.agg_L1err = torch.tensor(0.0)          # running mean L1 error
        self.agg_Nbad = [0 for _ in self.bad_ths]   # bad-pixel counters
        self._metrics = None                        # cached results

    def add_batch(self, predictions, gt):
        """Fold one batch into the running statistics."""
        assert predictions.size(1) == 1, predictions.size()
        assert gt.size(1) == 1, gt.size()
        if gt.size(2) == predictions.size(2) * 2 and gt.size(3) == predictions.size(3) * 2:
            # Spring stores gt at twice the resolution: score each prediction
            # pixel against its best-matching of the 4 gt sub-pixels.
            sub_errs = [torch.sum(torch.abs(gt[:, :, dy::2, dx::2] - predictions), dim=1)
                        for dy in (0, 1) for dx in (0, 1)]
            l1 = sub_errs[0]
            for err in sub_errs[1:]:
                l1 = torch.minimum(l1, err)
            valid = torch.isfinite(l1)
        else:
            valid = torch.isfinite(gt[:, 0, :, :])
            l1 = torch.sum(torch.abs(gt - predictions), dim=1)
        n = valid.sum()
        total = self.agg_N + n
        # incremental update of the running mean L1 error
        self.agg_L1err = float(self.agg_N) / total * self.agg_L1err \
                         + l1[valid].mean().cpu() * float(n) / total
        self.agg_N = total
        for i, th in enumerate(self.bad_ths):
            self.agg_Nbad[i] += (l1[valid] > th).sum().cpu()

    def _compute_metrics(self):
        # compute once and cache in self._metrics
        if self._metrics is not None:
            return
        out = {'L1err': self.agg_L1err.item()}
        for i, th in enumerate(self.bad_ths):
            out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0
        self._metrics = out

    def get_results(self):
        self._compute_metrics()  # avoids recomputing on repeated calls
        return self._metrics
+
+class FlowDatasetMetrics(nn.Module):
+
    def __init__(self):
        """Dataset-level optical-flow metrics, aggregated batch by batch."""
        super().__init__()
        self.bad_ths = [0.5,1,3,5]  # EPE thresholds for the bad-pixel ratios
        self.speed_ths = [(0,10),(10,40),(40,torch.inf)]  # gt-speed bins for per-speed EPE
+
    def reset(self):
        """Clear all running statistics; call once before the first add_batch()."""
        self.agg_N = 0 # number of pixels so far
        self.agg_L1err = torch.tensor(0.0) # L1 error so far
        self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far
        self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels
        self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far
        self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far
        self._metrics = None
        self.pairname_results = {}
+
+ def add_batch(self, predictions, gt):
+ assert predictions.size(1)==2, predictions.size()
+ assert gt.size(1)==2, gt.size()
+ if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ...
+ L1err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1),
+ torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)),
+ torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1))
+ L2err = torch.minimum( torch.minimum( torch.minimum(
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))),
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1)))
+ valid = torch.isfinite(L1err)
+ gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\
+ torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them
+ else:
+ valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite
+ L1err = torch.sum(torch.abs(gt-predictions),dim=1)
+ L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1))
+ gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1))
+ N = valid.sum()
+ Nnew = self.agg_N + N
+ self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew
+ self.agg_N = Nnew
+ for i,th in enumerate(self.bad_ths):
+ self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu()
+ for i,(th1,th2) in enumerate(self.speed_ths):
+ vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time)
+ self._prepare_data()
+ self._load_or_build_cache()
+
    def prepare_data(self):
        """Dataset-specific initialization hook; to be defined for each dataset.

        NOTE(review): the subclasses visible in this file define `_prepare_data`
        (which __init__ calls), so this un-prefixed stub appears never to be
        overridden — confirm whether it should be named `_prepare_data`.
        """
        raise NotImplementedError
+
    def __len__(self):
        """Number of image pairs in this split."""
        return len(self.pairnames) # each pairname is typically of the form (str, int1, int2)
+
    def __getitem__(self, index):
        """Return (img1, img2, flow, pairname) for the pair at `index`.

        When the split has no ground truth (e.g. test), `flow` is an empty
        tensor; `pairname` is stringified so the default collate can batch it.
        """
        pairname = self.pairnames[index]

        # get filenames
        img1name = self.pairname_to_img1name(pairname)
        img2name = self.pairname_to_img2name(pairname)
        flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None

        # load images and flow ground truth (if any)
        img1 = _read_img(img1name)
        img2 = _read_img(img2name)
        flow = self.load_flow(flowname) if flowname is not None else None

        # apply augmentations
        if self.augmentor is not None:
            img1, img2, flow = self.augmentor(img1, img2, flow, self.name)

        if self.totensor:
            img1 = img_to_tensor(img1)
            img2 = img_to_tensor(img2)
            if flow is not None:
                flow = flow_to_tensor(flow)
            else:
                flow = torch.tensor([]) # to allow dataloader batching with default collate_fn
            pairname = str(pairname) # transform potential tuple to str to be able to batch it

        return img1, img2, flow, pairname
+
    def __rmul__(self, v):
        """Repeat the pair list `v` times (epoch-size multiplier, e.g. `3 * dataset`).

        NOTE(review): mutates the dataset in place and returns it rather than
        returning a new object; `self.rmul` records the accumulated factor for
        __repr__.
        """
        self.rmul *= v
        self.pairnames = v * self.pairnames
        return self
+
    def __str__(self):
        """Short identifier: '<ClassName>_<split>'."""
        return f'{self.__class__.__name__}_{self.split}'
+
    def __repr__(self):
        """Detailed description including split, augmentor, crop size and pair count."""
        s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})'
        if self.rmul==1:
            s+=f'\n\tnum pairs: {len(self.pairnames)}'
        else:
            # show both the repeated count and the underlying count x multiplier
            s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})'
        return s
+
    def _set_root(self):
        """Resolve this dataset's root directory from the module-level `dataset_to_root` map."""
        self.root = dataset_to_root[self.name]
        assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}"
+
    def _load_or_build_cache(self):
        """Load the pairname list for this split from the pickle cache, building it on first use.

        The cache stores the full split dict returned by `_build_cache`; only
        `self.split` is kept in memory. The pickle file is local data produced
        by this very code, so unpickling it is trusted.
        """
        cache_file = osp.join(cache_dir, self.name+'.pkl')
        if osp.isfile(cache_file):
            with open(cache_file, 'rb') as fid:
                self.pairnames = pickle.load(fid)[self.split]
        else:
            tosave = self._build_cache()
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_file, 'wb') as fid:
                pickle.dump(tosave, fid)
            self.pairnames = tosave[self.split]
+
class TartanAirDataset(FlowDataset):
    """TartanAir optical-flow pairs (train split only): consecutive left-camera frames."""

    def _prepare_data(self):
        self.name = "TartanAir"
        self._set_root()
        assert self.split in ['train']
        # pairname is (sequence_subdir, frame_i, frame_j)
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], f'image_left/{pairname[1]:06d}_left.png')
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], f'image_left/{pairname[2]:06d}_left.png')
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], f'flow/{pairname[1]:06d}_{pairname[2]:06d}_flow.npy')
        self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], f'{pairname[1]:06d}_{pairname[2]:06d}')
        self.load_flow = _read_numpy_flow

    def _build_cache(self):
        # Enumerate consecutive-frame pairs for every sequence / difficulty / trajectory.
        pairs = []
        for seq in sorted(os.listdir(self.root)):
            for difficulty in ['Easy', 'Hard']:
                diff_root = osp.join(self.root, seq, seq, difficulty)
                for traj in sorted(os.listdir(diff_root)):
                    frames = sorted(os.listdir(osp.join(diff_root, traj, 'image_left/')))
                    for fname in frames[:-1]:  # last frame has no successor
                        idx = int(fname[:6])
                        pairs.append((osp.join(seq, seq, difficulty, traj), idx, idx + 1))
        assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir"
        return {'train': pairs}
+
class FlyingChairsDataset(FlowDataset):
    """FlyingChairs optical-flow dataset; train/val split read from chairs_split.txt."""

    def _prepare_data(self):
        self.name = "FlyingChairs"
        self._set_root()
        assert self.split in ['train','val']
        # pairname is the 5-digit sample id as a string
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, 'data', pairname+'_img1.ppm')
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm')
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo')
        self.pairname_to_str = lambda pairname: pairname
        self.load_flow = _read_flo_file

    def _build_cache(self):
        """Parse chairs_split.txt (1 = train, 2 = val) into lists of 5-digit pair ids."""
        split_file = osp.join(self.root, 'chairs_split.txt')
        split_list = np.loadtxt(split_file, dtype=np.int32)
        # sample ids are 1-based, hence the +1
        trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1]
        valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1]
        # BUGFIX: the message previously said "MPI-Sintel" (copy-paste from another dataset)
        assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in FlyingChairs"
        tosave = {'train': trainpairs, 'val': valpairs}
        return tosave
+
class FlyingThingsDataset(FlowDataset):
    """FlyingThings3D optical-flow pairs.

    Splits combine {train, test, test1024} x {clean, final, all}pass x an
    optional '_rightcam' suffix; test1024 matches unimatch's validation split.
    """

    def _prepare_data(self):
        self.name = "FlyingThings"
        self._set_root()
        assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']]
        # pairname is (flow_subdir, frame_i, frame_j, pass_name)
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1]))
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2]))
        self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
        self.pairname_to_str = lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' ))
        self.load_flow = _read_pfm_flow

    def _build_cache(self):
        tosave = {}
        # train and test splits for the different passes
        for set_ in ['train', 'test']:
            sroot = osp.join(self.root, 'optical_flow', set_.upper())
            fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')])
            # forward (into_future) frame pairs; the reversed (into_past) pairs are added below
            pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]]
            pairs = [(a,i,i+1) for a,i in pp]
            pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp]
            assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things"
            for cam in ['left','right']:
                camstr = '' if cam=='left' else f'_{cam}cam'
                for pass_ in ['final', 'clean']:
                    tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs]
                tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}']
        # test1024: this is the same split as unimatch 'validation' split
        # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229
        test1024_nsamples = 1024
        alltest_nsamples = len(tosave['test_cleanpass']) # 7866
        stride = alltest_nsamples // test1024_nsamples
        remove = alltest_nsamples % test1024_nsamples
        for cam in ['left','right']:
            camstr = '' if cam=='left' else f'_{cam}cam'
            for pass_ in ['final','clean']:
                tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before
            assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things"
            tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}']
        return tosave
+
+
+class MPISintelDataset(FlowDataset):
+
    def _prepare_data(self):
        """Set up MPI-Sintel filename mappings; pairname is (sequence_subdir, frame_index)."""
        self.name = "MPISintel"
        self._set_root()
        assert self.split in [s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']]
        self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]))
        self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1))
        # the test split ships no ground-truth flow
        self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1]))
        self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1]))
        self.load_flow = _read_flo_file
+
    def _build_cache(self):
        """Build train/test/subval/subtrain pair lists for clean/final/all passes.

        subval holds out the temple_2 and temple_3 sequences from train;
        subtrain is the remainder.
        """
        trainseqs = sorted(os.listdir(self.root+'training/clean'))
        trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))]
        subvalseqs = ['temple_2','temple_3']
        subtrainseqs = [s for s in trainseqs if s not in subvalseqs]
        subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)]
        subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)]
        testseqs = sorted(os.listdir(self.root+'test/clean'))
        testpairs = [ (osp.join('test/clean', s),i) for s in testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))]
        assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel"
        tosave = {}
        tosave['train_cleanpass'] = trainpairs
        tosave['test_cleanpass'] = testpairs
        tosave['subval_cleanpass'] = subvalpairs
        tosave['subtrain_cleanpass'] = subtrainpairs
        # finalpass reuses the same frame indices with the final-render images
        for t in ['train','test','subval','subtrain']:
            tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']]
            tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass']
        return tosave
+
    def submission_save_pairname(self, pairname, prediction, outdir, _time):
        """Write one predicted flow map (HxWx2) as a .flo file under <outdir>/submission/.

        `_time` is accepted for interface compatibility but unused here.
        """
        assert prediction.shape[2]==2
        outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo')
        os.makedirs( os.path.dirname(outfile), exist_ok=True)
        writeFlowFile(prediction, outfile)
+
+ def finalize_submission(self, outdir):
+ assert self.split == 'test_allpass'
+ bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg