adirathor07 committed
Commit 757ed1c · 0 Parent(s)

initial commit
.gitignore ADDED
@@ -0,0 +1 @@
1
+ saved_model/
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Snap2scene
3
+ emoji: 😻
4
+ colorFrom: pink
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/binvox_rw.cpython-38.pyc ADDED
Binary file (7.44 kB).
 
__pycache__/config.cpython-38.pyc ADDED
Binary file (2.48 kB).
 
__pycache__/data_transforms.cpython-38.pyc ADDED
Binary file (11.8 kB).
 
__pycache__/helpers.cpython-38.pyc ADDED
Binary file (2.86 kB).
 
__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.8 kB).
 
app.py ADDED
@@ -0,0 +1,39 @@
1
+ import streamlit as st
2
+ import numpy as np
3
+ import plotly.graph_objects as go
4
+ from PIL import Image
5
+ from helpers import *
6
+
7
+ # --- APP START ---
8
+ st.title("2D → 3D Voxel Reconstruction Viewer")
9
+
10
+ uploaded_images = st.file_uploader("Upload images", accept_multiple_files=True, type=["png", "jpg", "jpeg"])
11
+ # print(uploaded_images)
12
+
13
+
14
+ # --- DISPLAY ---
15
+ if uploaded_images:
16
+ st.subheader("Uploaded Input Views")
17
+ cols = st.columns(len(uploaded_images))
18
+ rendering_images = []
19
+
20
+ for i, uploaded_file in enumerate(uploaded_images):
21
+ img = Image.open(uploaded_file)
22
+
23
+ cols[i].image(img, caption=f"View {i+1}", use_container_width=True)
24
+
25
+ img_np = np.array(img).astype(np.float32) / 255.0
26
+
27
+ rendering_images.append(img_np)
28
+
29
+
30
+ if st.button("Submit for Reconstruction"):
31
+ gv=None
32
+ with st.spinner("Reconstructing..."):
33
+ gv = predict_voxel_from_images(rendering_images)
34
+
35
+ fig = voxel_to_plotly(gv)
36
+ st.plotly_chart(fig, use_container_width=True)
37
+
38
+ else:
39
+ st.info("Upload images to continue.")
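For reference, a minimal offline sketch (not part of the commit) of how the same helpers could be exercised without Streamlit; the image paths and output file name are placeholders, and it assumes saved_model/Pix2Vox.pth is present:

import numpy as np
from PIL import Image

from helpers import predict_voxel_from_images, voxel_to_plotly

# Load input views as float32 arrays in [0, 1]; .convert("RGB") is an extra
# safeguard not present in app.py itself.
paths = ["view_0.png", "view_1.png"]  # placeholder file names
rendering_images = [
    np.array(Image.open(p).convert("RGB")).astype(np.float32) / 255.0 for p in paths
]

gv = predict_voxel_from_images(rendering_images)  # (32, 32, 32) uint8 occupancy grid
print("occupied voxels:", int(gv.sum()))

fig = voxel_to_plotly(gv)               # same Plotly scatter the app renders
fig.write_html("reconstruction.html")   # open in a browser to inspect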
check.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
config.py ADDED
@@ -0,0 +1,108 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Developed by Haozhe Xie <[email protected]>
4
+
5
+ from easydict import EasyDict as edict
6
+
7
+ __C = edict()
8
+ cfg = __C
9
+
10
+ #
11
+ # Dataset Config
12
+ #
13
+ __C.DATASETS = edict()
14
+ __C.DATASETS.SHAPENET = edict()
15
+ __C.DATASETS.SHAPENET.TAXONOMY_FILE_PATH = 'datasets/ShapeNet.json'
16
+ # __C.DATASETS.SHAPENET.TAXONOMY_FILE_PATH = './datasets/PascalShapeNet.json'
17
+ __C.DATASETS.SHAPENET.RENDERING_PATH = 'datasets/ShapeNetRendering/%s/%s/rendering/%02d.png'
18
+ # __C.DATASETS.SHAPENET.RENDERING_PATH = '/home/hzxie/Datasets/ShapeNet/PascalShapeNetRendering/%s/%s/render_%04d.jpg'
19
+ __C.DATASETS.SHAPENET.VOXEL_PATH = 'datasets/ShapeNetVox32/%s/%s/model.binvox'
20
+ __C.DATASETS.PASCAL3D = edict()
21
+ __C.DATASETS.PASCAL3D.TAXONOMY_FILE_PATH = 'datasets/Pascal3D.json'
22
+ __C.DATASETS.PASCAL3D.ANNOTATION_PATH = '/home/hzxie/Datasets/PASCAL3D/Annotations/%s_imagenet/%s.mat'
23
+ __C.DATASETS.PASCAL3D.RENDERING_PATH = '/home/hzxie/Datasets/PASCAL3D/Images/%s_imagenet/%s.JPEG'
24
+ __C.DATASETS.PASCAL3D.VOXEL_PATH = '/home/hzxie/Datasets/PASCAL3D/CAD/%s/%02d.binvox'
25
+ __C.DATASETS.PIX3D = edict()
26
+ __C.DATASETS.PIX3D.TAXONOMY_FILE_PATH = 'datasets/Pix3D.json'
27
+ __C.DATASETS.PIX3D.ANNOTATION_PATH = 'datasets/Pix3D/pix3d.json'
28
+ __C.DATASETS.PIX3D.RENDERING_PATH = 'datasets/Pix3D/img/%s/%s.%s'
29
+ __C.DATASETS.PIX3D.VOXEL_PATH = 'datasets/Pix3D/model/%s/%s/%s.binvox'
30
+
31
+ #
32
+ # Dataset
33
+ #
34
+ __C.DATASET = edict()
35
+ __C.DATASET.MEAN = [0.5, 0.5, 0.5]
36
+ __C.DATASET.STD = [0.5, 0.5, 0.5]
37
+ __C.DATASET.TRAIN_DATASET = 'ShapeNet'
38
+ __C.DATASET.TEST_DATASET = 'ShapeNet'
39
+ # __C.DATASET.TEST_DATASET = 'Pascal3D'
40
+ # __C.DATASET.TEST_DATASET = 'Pix3D'
41
+
42
+ #
43
+ # Common
44
+ #
45
+ __C.CONST = edict()
46
+ __C.CONST.DEVICE = '0'
47
+ __C.CONST.RNG_SEED = 0
48
+ __C.CONST.IMG_W = 224 # Image width for input
49
+ __C.CONST.IMG_H = 224 # Image height for input
50
+ __C.CONST.N_VOX = 32
51
+ __C.CONST.BATCH_SIZE = 64
52
+ __C.CONST.N_VIEWS_RENDERING = 1 # Dummy property for Pascal 3D
53
+ __C.CONST.CROP_IMG_W = 128 # Dummy property for Pascal 3D
54
+ __C.CONST.CROP_IMG_H = 128 # Dummy property for Pascal 3D
55
+
56
+ #
57
+ # Directories
58
+ #
59
+ __C.DIR = edict()
60
+ __C.DIR.OUT_PATH = './output'
61
+ __C.DIR.RANDOM_BG_PATH = '/home/hzxie/Datasets/SUN2012/JPEGImages'
62
+
63
+ #
64
+ # Network
65
+ #
66
+ __C.NETWORK = edict()
67
+ __C.NETWORK.LEAKY_VALUE = .2
68
+ __C.NETWORK.TCONV_USE_BIAS = False
69
+ __C.NETWORK.USE_REFINER = True
70
+ __C.NETWORK.USE_MERGER = True
71
+
72
+ #
73
+ # Training
74
+ #
75
+ __C.TRAIN = edict()
76
+ __C.TRAIN.RESUME_TRAIN = False
77
+ __C.TRAIN.NUM_WORKER = 4 # number of data workers
78
+ __C.TRAIN.NUM_EPOCHES = 5
79
+ __C.TRAIN.BRIGHTNESS = .4
80
+ __C.TRAIN.CONTRAST = .4
81
+ __C.TRAIN.SATURATION = .4
82
+ __C.TRAIN.NOISE_STD = .1
83
+ __C.TRAIN.RANDOM_BG_COLOR_RANGE = [[225, 255], [225, 255], [225, 255]]
84
+ __C.TRAIN.POLICY = 'adam' # available options: sgd, adam
85
+ __C.TRAIN.EPOCH_START_USE_REFINER = 0
86
+ __C.TRAIN.EPOCH_START_USE_MERGER = 0
87
+ __C.TRAIN.ENCODER_LEARNING_RATE = 1e-3
88
+ __C.TRAIN.DECODER_LEARNING_RATE = 1e-3
89
+ __C.TRAIN.REFINER_LEARNING_RATE = 1e-3
90
+ __C.TRAIN.MERGER_LEARNING_RATE = 1e-4
91
+ __C.TRAIN.DISCRIMINATOR_LR = 1e-4
92
+ __C.TRAIN.GAN_LOSS_WEIGHT = 0.01
93
+ __C.TRAIN.ENCODER_LR_MILESTONES = [150]
94
+ __C.TRAIN.DECODER_LR_MILESTONES = [150]
95
+ __C.TRAIN.REFINER_LR_MILESTONES = [150]
96
+ __C.TRAIN.MERGER_LR_MILESTONES = [150]
97
+ __C.TRAIN.BETAS = (.9, .999)
98
+ __C.TRAIN.MOMENTUM = .9
99
+ __C.TRAIN.GAMMA = .5
100
+ __C.TRAIN.SAVE_FREQ = 10 # weights will be overwritten every save_freq epoch
101
+ __C.TRAIN.UPDATE_N_VIEWS_RENDERING = False
102
+
103
+ #
104
+ # Testing options
105
+ #
106
+ __C.TEST = edict()
107
+ __C.TEST.RANDOM_BG_COLOR_RANGE = [[240, 240], [240, 240], [240, 240]]
108
+ __C.TEST.VOXEL_THRESH = [.2, .3, .4, .5]
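Since cfg is an EasyDict, downstream code can override any of these defaults at runtime; helpers.py below does exactly that for the checkpoint path. A small illustrative sketch (the second override is hypothetical, not something this repo does):

from config import cfg

cfg.CONST.WEIGHTS = 'saved_model/Pix2Vox.pth'  # helpers.py sets this before loading the model
cfg.CONST.N_VIEWS_RENDERING = 3                # hypothetical override for multi-view experiments
print(cfg.CONST.IMG_W, cfg.CONST.IMG_H)        # 224 224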
helpers.py ADDED
@@ -0,0 +1,119 @@
1
+ import utils.binvox_rw as binvox_rw
2
+ import numpy as np
3
+ import plotly.graph_objects as go
4
+ from models.encoder import Encoder
5
+ from models.decoder import Decoder
6
+ from models.merger import Merger
7
+ from models.refiner import Refiner
8
+ from config import cfg
9
+ import torch
10
+ from datetime import datetime as dt
11
+ import utils.data_transforms
12
+ device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+ print(device)
14
+ # device='cpu'
15
+
16
+ cfg.CONST.WEIGHTS='saved_model/Pix2Vox.pth'
17
+
18
+
19
+ def read_binvox(file) -> np.ndarray:
20
+ model = binvox_rw.read_as_3d_array(file)
21
+ return model.data.astype(np.uint8)
22
+
23
+
24
+ def voxel_to_plotly(voxels):
25
+ x, y, z = voxels.nonzero()
26
+ fig = go.Figure(data=[
27
+ go.Scatter3d(
28
+ x=x, y=y, z=z,
29
+ mode='markers',
30
+ marker=dict(size=3, color=z, colorscale='Viridis', opacity=0.7)
31
+ )
32
+ ])
33
+ fig.update_layout(scene=dict(aspectmode='data'))
34
+ return fig
35
+
36
+
37
+
38
+ IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
39
+ CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
40
+ test_transforms = utils.data_transforms.Compose([
41
+ utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
42
+ utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
43
+ utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
44
+ utils.data_transforms.ToTensor(),
45
+ ])
46
+
47
+
48
+ def predict_voxel_from_images(rendering_images):
49
+ transformed_images = test_transforms(rendering_images)
50
+
51
+ encoder = Encoder(cfg)
52
+ decoder = Decoder(cfg)
53
+ refiner = Refiner(cfg)
54
+ merger = Merger(cfg)
55
+
56
+
57
+ if torch.cuda.is_available():
58
+ encoder = torch.nn.DataParallel(encoder).cuda()
59
+ decoder = torch.nn.DataParallel(decoder).cuda()
60
+ refiner = torch.nn.DataParallel(refiner).cuda()
61
+ merger = torch.nn.DataParallel(merger).cuda()
62
+
63
+ print('[INFO] %s Loading weights from %s ...' % (dt.now(), cfg.CONST.WEIGHTS))
64
+ checkpoint = torch.load(cfg.CONST.WEIGHTS, map_location=device)  # map to CPU when no GPU is available
65
+
66
+ epoch_idx = checkpoint['epoch_idx']
67
+ encoder.load_state_dict(checkpoint['encoder_state_dict'])
68
+ decoder.load_state_dict(checkpoint['decoder_state_dict'])
69
+
70
+ if cfg.NETWORK.USE_REFINER:
71
+ refiner.load_state_dict(checkpoint['refiner_state_dict'])
72
+ if cfg.NETWORK.USE_MERGER:
73
+ merger.load_state_dict(checkpoint['merger_state_dict'])
74
+
75
+
76
+ encoder.eval()
77
+ decoder.eval()
78
+ merger.eval()
79
+ refiner.eval()
80
+
81
+
82
+ with torch.no_grad():
83
+
84
+ transformed_images = transformed_images.unsqueeze(0) #adding the batch_dim
85
+ transformed_images = transformed_images.to(device)
86
+
87
+ # print(rendering_images.shape)
88
+ image_features = encoder(transformed_images)
89
+ print(image_features.shape)
90
+ raw_features, generated_volume = decoder(image_features)
91
+ print(generated_volume.shape)
92
+
93
+
94
+ if cfg.NETWORK.USE_MERGER:
95
+ generated_volume = merger(raw_features, generated_volume)
96
+ else:
97
+ generated_volume = torch.mean(generated_volume, dim=1)
98
+
99
+
100
+ # encoder_loss = bce_loss(generated_volume, ground_truth_volume) * 10
101
+
102
+
103
+ if cfg.NETWORK.USE_REFINER:
104
+ generated_volume = refiner(generated_volume)
105
+ # refiner_loss = bce_loss(generated_volume, ground_truth_volume) * 10
106
+ else:
107
+ # refiner_loss = encoder_loss
108
+ pass
109
+
110
+
111
+ generated_volume=generated_volume.squeeze(0)
112
+ gv = generated_volume.cpu().numpy()
113
+ gv = (gv >= 0.5).astype(np.uint8)
114
+
115
+
116
+
117
+
118
+ torch.cuda.empty_cache()
119
+ return gv
models/__init__.py ADDED
File without changes
models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (169 Bytes).
 
models/__pycache__/decoder.cpython-38.pyc ADDED
Binary file (2.26 kB).
 
models/__pycache__/encoder.cpython-38.pyc ADDED
Binary file (2.05 kB).
 
models/__pycache__/merger.cpython-38.pyc ADDED
Binary file (1.65 kB).
 
models/__pycache__/refiner.cpython-38.pyc ADDED
Binary file (1.92 kB).
 
models/decoder.py ADDED
@@ -0,0 +1,88 @@
1
+ import torch
2
+
3
+ class Decoder(torch.nn.Module):
4
+ def __init__(self, cfg):
5
+ super(Decoder, self).__init__()
6
+ self.cfg = cfg
7
+
8
+ # Layer Definition
9
+ self.layer1 = torch.nn.Sequential(
10
+ torch.nn.ConvTranspose3d(2048, 512, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
11
+ torch.nn.BatchNorm3d(512),
12
+ torch.nn.ReLU()
13
+ )
14
+ self.layer2 = torch.nn.Sequential(
15
+ torch.nn.ConvTranspose3d(512, 128, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
16
+ torch.nn.BatchNorm3d(128),
17
+ torch.nn.ReLU()
18
+ )
19
+ self.layer3 = torch.nn.Sequential(
20
+ torch.nn.ConvTranspose3d(128, 32, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
21
+ torch.nn.BatchNorm3d(32),
22
+ torch.nn.ReLU()
23
+ )
24
+ self.layer4 = torch.nn.Sequential(
25
+ torch.nn.ConvTranspose3d(32, 8, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
26
+ torch.nn.BatchNorm3d(8),
27
+ torch.nn.ReLU()
28
+ )
29
+ self.layer5 = torch.nn.Sequential(
30
+ torch.nn.ConvTranspose3d(8, 1, kernel_size=1, bias=cfg.NETWORK.TCONV_USE_BIAS),
31
+ torch.nn.Sigmoid()
32
+ )
33
+
34
+ def forward(self, image_features):
35
+ image_features = image_features.permute(1, 0, 2, 3, 4).contiguous()
36
+ image_features = torch.split(image_features, 1, dim=0)
37
+ gen_volumes = []
38
+ raw_features = []
39
+
40
+ for features in image_features:
41
+ gen_volume = features.view(-1, 2048, 2, 2, 2)
42
+ # print(gen_volume.size()) # torch.Size([batch_size, 2048, 2, 2, 2])
43
+ gen_volume = self.layer1(gen_volume)
44
+ # print(gen_volume.size()) # torch.Size([batch_size, 512, 4, 4, 4])
45
+ gen_volume = self.layer2(gen_volume)
46
+ # print(gen_volume.size()) # torch.Size([batch_size, 128, 8, 8, 8])
47
+ gen_volume = self.layer3(gen_volume)
48
+ # print(gen_volume.size()) # torch.Size([batch_size, 32, 16, 16, 16])
49
+ gen_volume = self.layer4(gen_volume)
50
+ raw_feature = gen_volume
51
+ # print(gen_volume.size()) # torch.Size([batch_size, 8, 32, 32, 32])
52
+ gen_volume = self.layer5(gen_volume)
53
+ # print(gen_volume.size()) # torch.Size([batch_size, 1, 32, 32, 32])
54
+ raw_feature = torch.cat((raw_feature, gen_volume), dim=1)
55
+ # print(raw_feature.size()) # torch.Size([batch_size, 9, 32, 32, 32])
56
+
57
+ gen_volumes.append(torch.squeeze(gen_volume, dim=1))
58
+ raw_features.append(raw_feature)
59
+
60
+ gen_volumes = torch.stack(gen_volumes).permute(1, 0, 2, 3, 4).contiguous()
61
+ raw_features = torch.stack(raw_features).permute(1, 0, 2, 3, 4, 5).contiguous()
62
+ # print(gen_volumes.size()) # torch.Size([batch_size, n_views, 32, 32, 32])
63
+ # print(raw_features.size()) # torch.Size([batch_size, n_views, 9, 32, 32, 32])
64
+ return raw_features, gen_volumes
65
+
66
+
+ # Quick shape check; guarded so it does not run when Decoder is imported
+ # (helpers.py imports this module at app start-up).
+ if __name__ == '__main__':
+     class DummyCfg:
+         class NETWORK:
+             TCONV_USE_BIAS = False
+
+     cfg = DummyCfg()
+
+     # Instantiate the decoder
+     decoder = Decoder(cfg)
+
+     # Simulate input: shape [batch_size, n_views, img_c, img_h, img_w]
+     n_views = 1
+     batch_size = 64
+     img_c, img_h, img_w = 256, 8, 8
+     dummy_input = torch.randn(batch_size, n_views, img_c, img_h, img_w)
+
+     # Run the decoder
+     print(dummy_input.shape)
+     raw_features, gen_volumes = decoder(dummy_input)
+
+     # Output shapes
+     print("raw_features shape:", raw_features.shape)  # Expected: [64, 1, 9, 32, 32, 32]
+     print("gen_volumes shape:", gen_volumes.shape)    # Expected: [64, 1, 32, 32, 32]
models/encoder.py ADDED
@@ -0,0 +1,85 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Developed by Haozhe Xie <[email protected]>
4
+ #
5
+ # References:
6
+ # - https://github.com/shawnxu1318/MVCNN-Multi-View-Convolutional-Neural-Networks/blob/master/mvcnn.py
7
+
8
+ import torch
9
+ import torchvision.models
10
+
11
+
12
+ class Encoder(torch.nn.Module):
13
+ def __init__(self, cfg):
14
+ super(Encoder, self).__init__()
15
+ self.cfg = cfg
16
+
17
+ # Layer Definition
18
+ vgg16_bn = torchvision.models.vgg16_bn(pretrained=True)
19
+ self.vgg = torch.nn.Sequential(*list(vgg16_bn.features.children()))[:27]
20
+ self.layer1 = torch.nn.Sequential(
21
+ torch.nn.Conv2d(512, 512, kernel_size=3),
22
+ torch.nn.BatchNorm2d(512),
23
+ torch.nn.ELU(),
24
+ )
25
+ self.layer2 = torch.nn.Sequential(
26
+ torch.nn.Conv2d(512, 512, kernel_size=3),
27
+ torch.nn.BatchNorm2d(512),
28
+ torch.nn.ELU(),
29
+ torch.nn.MaxPool2d(kernel_size=3)
30
+ )
31
+ self.layer3 = torch.nn.Sequential(
32
+ torch.nn.Conv2d(512, 256, kernel_size=1),
33
+ torch.nn.BatchNorm2d(256),
34
+ torch.nn.ELU()
35
+ )
36
+
37
+ # Don't update params in VGG16
38
+ for param in vgg16_bn.parameters():
39
+ param.requires_grad = False
40
+
41
+ def forward(self, rendering_images):
42
+ # print(rendering_images.size()) # torch.Size([batch_size, n_views, img_c, img_h, img_w])
43
+ rendering_images = rendering_images.permute(1, 0, 2, 3, 4).contiguous()
44
+ rendering_images = torch.split(rendering_images, 1, dim=0)
45
+ image_features = []
46
+
47
+ for img in rendering_images:
48
+ features = self.vgg(img.squeeze(dim=0))
49
+ # print(features.size()) # torch.Size([batch_size, 512, 28, 28])
50
+ features = self.layer1(features)
51
+ # print(features.size()) # torch.Size([batch_size, 512, 26, 26])
52
+ features = self.layer2(features)
53
+ # print(features.size()) # torch.Size([batch_size, 512, 8, 8])
54
+ features = self.layer3(features)
55
+ # print(features.size()) # torch.Size([batch_size, 256, 8, 8])
56
+ image_features.append(features)
57
+
58
+ image_features = torch.stack(image_features).permute(1, 0, 2, 3, 4).contiguous()
59
+ # print(image_features.size()) # torch.Size([batch_size, n_views, 256, 8, 8])
60
+ return image_features
61
+
62
+
63
+
+ # Quick shape check; guarded so it does not run when Encoder is imported
+ # (helpers.py imports this module at app start-up).
+ if __name__ == '__main__':
+     class DummyCfg:
+         class NETWORK:
+             TCONV_USE_BIAS = False
+
+     cfg = DummyCfg()
+
+     # Instantiate the encoder
+     encoder = Encoder(cfg)
+
+     # Simulate input: shape [batch_size, n_views, img_c, img_h, img_w]
+     batch_size = 64
+     n_views = 5
+     img_c, img_h, img_w = 3, 224, 224
+     dummy_input = torch.randn(batch_size, n_views, img_c, img_h, img_w)
+
+     # Run the encoder
+     print(dummy_input.shape)
+     image_features = encoder(dummy_input)
+
+     print("image_features shape:", image_features.shape)  # Expected: [64, 5, 256, 8, 8]
models/merger.py ADDED
@@ -0,0 +1,71 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Developed by Haozhe Xie <[email protected]>
4
+
5
+ import torch
6
+
7
+
8
+ class Merger(torch.nn.Module):
9
+ def __init__(self, cfg):
10
+ super(Merger, self).__init__()
11
+ self.cfg = cfg
12
+
13
+ # Layer Definition
14
+ self.layer1 = torch.nn.Sequential(
15
+ torch.nn.Conv3d(9, 16, kernel_size=3, padding=1),
16
+ torch.nn.BatchNorm3d(16),
17
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE)
18
+ )
19
+ self.layer2 = torch.nn.Sequential(
20
+ torch.nn.Conv3d(16, 8, kernel_size=3, padding=1),
21
+ torch.nn.BatchNorm3d(8),
22
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE)
23
+ )
24
+ self.layer3 = torch.nn.Sequential(
25
+ torch.nn.Conv3d(8, 4, kernel_size=3, padding=1),
26
+ torch.nn.BatchNorm3d(4),
27
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE)
28
+ )
29
+ self.layer4 = torch.nn.Sequential(
30
+ torch.nn.Conv3d(4, 2, kernel_size=3, padding=1),
31
+ torch.nn.BatchNorm3d(2),
32
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE)
33
+ )
34
+ self.layer5 = torch.nn.Sequential(
35
+ torch.nn.Conv3d(2, 1, kernel_size=3, padding=1),
36
+ torch.nn.BatchNorm3d(1),
37
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE)
38
+ )
39
+
40
+ def forward(self, raw_features, coarse_volumes):
41
+ n_views_rendering = coarse_volumes.size(1)
42
+ raw_features = torch.split(raw_features, 1, dim=1)
43
+ volume_weights = []
44
+
45
+ for i in range(n_views_rendering):
46
+ raw_feature = torch.squeeze(raw_features[i], dim=1)
47
+ # print(raw_feature.size()) # torch.Size([batch_size, 9, 32, 32, 32])
48
+
49
+ volume_weight = self.layer1(raw_feature)
50
+ # print(volume_weight.size()) # torch.Size([batch_size, 16, 32, 32, 32])
51
+ volume_weight = self.layer2(volume_weight)
52
+ # print(volume_weight.size()) # torch.Size([batch_size, 8, 32, 32, 32])
53
+ volume_weight = self.layer3(volume_weight)
54
+ # print(volume_weight.size()) # torch.Size([batch_size, 4, 32, 32, 32])
55
+ volume_weight = self.layer4(volume_weight)
56
+ # print(volume_weight.size()) # torch.Size([batch_size, 2, 32, 32, 32])
57
+ volume_weight = self.layer5(volume_weight)
58
+ # print(volume_weight.size()) # torch.Size([batch_size, 1, 32, 32, 32])
59
+
60
+ volume_weight = torch.squeeze(volume_weight, dim=1)
61
+ # print(volume_weight.size()) # torch.Size([batch_size, 32, 32, 32])
62
+ volume_weights.append(volume_weight)
63
+
64
+ volume_weights = torch.stack(volume_weights).permute(1, 0, 2, 3, 4).contiguous()
65
+ volume_weights = torch.softmax(volume_weights, dim=1)
66
+ # print(volume_weights.size()) # torch.Size([batch_size, n_views, 32, 32, 32])
67
+ # print(coarse_volumes.size()) # torch.Size([batch_size, n_views, 32, 32, 32])
68
+ coarse_volumes = coarse_volumes * volume_weights
69
+ coarse_volumes = torch.sum(coarse_volumes, dim=1)
70
+
71
+ return torch.clamp(coarse_volumes, min=0, max=1)
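The fusion at the end of forward() is a per-voxel softmax over the view dimension followed by a weighted sum; a tiny standalone illustration of that step with toy tensors (not part of the commit):

import torch

coarse_volumes = torch.rand(2, 3, 32, 32, 32)  # [batch, n_views, 32, 32, 32]
scores = torch.rand(2, 3, 32, 32, 32)          # stand-in for the per-view maps from layer5
weights = torch.softmax(scores, dim=1)         # weights across views sum to 1 per voxel
fused = (coarse_volumes * weights).sum(dim=1)  # [batch, 32, 32, 32]
print(fused.shape)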
models/refiner.py ADDED
@@ -0,0 +1,77 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Developed by Haozhe Xie <[email protected]>
4
+
5
+ import torch
6
+
7
+
8
+ class Refiner(torch.nn.Module):
9
+ def __init__(self, cfg):
10
+ super(Refiner, self).__init__()
11
+ self.cfg = cfg
12
+
13
+ # Layer Definition
14
+ self.layer1 = torch.nn.Sequential(
15
+ torch.nn.Conv3d(1, 32, kernel_size=4, padding=2),
16
+ torch.nn.BatchNorm3d(32),
17
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE),
18
+ torch.nn.MaxPool3d(kernel_size=2)
19
+ )
20
+ self.layer2 = torch.nn.Sequential(
21
+ torch.nn.Conv3d(32, 64, kernel_size=4, padding=2),
22
+ torch.nn.BatchNorm3d(64),
23
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE),
24
+ torch.nn.MaxPool3d(kernel_size=2)
25
+ )
26
+ self.layer3 = torch.nn.Sequential(
27
+ torch.nn.Conv3d(64, 128, kernel_size=4, padding=2),
28
+ torch.nn.BatchNorm3d(128),
29
+ torch.nn.LeakyReLU(cfg.NETWORK.LEAKY_VALUE),
30
+ torch.nn.MaxPool3d(kernel_size=2)
31
+ )
32
+ self.layer4 = torch.nn.Sequential(
33
+ torch.nn.Linear(8192, 2048),
34
+ torch.nn.ReLU()
35
+ )
36
+ self.layer5 = torch.nn.Sequential(
37
+ torch.nn.Linear(2048, 8192),
38
+ torch.nn.ReLU()
39
+ )
40
+ self.layer6 = torch.nn.Sequential(
41
+ torch.nn.ConvTranspose3d(128, 64, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
42
+ torch.nn.BatchNorm3d(64),
43
+ torch.nn.ReLU()
44
+ )
45
+ self.layer7 = torch.nn.Sequential(
46
+ torch.nn.ConvTranspose3d(64, 32, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
47
+ torch.nn.BatchNorm3d(32),
48
+ torch.nn.ReLU()
49
+ )
50
+ self.layer8 = torch.nn.Sequential(
51
+ torch.nn.ConvTranspose3d(32, 1, kernel_size=4, stride=2, bias=cfg.NETWORK.TCONV_USE_BIAS, padding=1),
52
+ torch.nn.Sigmoid()
53
+ )
54
+
55
+ def forward(self, coarse_volumes):
56
+ volumes_32_l = coarse_volumes.view((-1, 1, self.cfg.CONST.N_VOX, self.cfg.CONST.N_VOX, self.cfg.CONST.N_VOX))
57
+ # print(volumes_32_l.size()) # torch.Size([batch_size, 1, 32, 32, 32])
58
+ volumes_16_l = self.layer1(volumes_32_l)
59
+ # print(volumes_16_l.size()) # torch.Size([batch_size, 32, 16, 16, 16])
60
+ volumes_8_l = self.layer2(volumes_16_l)
61
+ # print(volumes_8_l.size()) # torch.Size([batch_size, 64, 8, 8, 8])
62
+ volumes_4_l = self.layer3(volumes_8_l)
63
+ # print(volumes_4_l.size()) # torch.Size([batch_size, 128, 4, 4, 4])
64
+ flatten_features = self.layer4(volumes_4_l.view(-1, 8192))
65
+ # print(flatten_features.size()) # torch.Size([batch_size, 2048])
66
+ flatten_features = self.layer5(flatten_features)
67
+ # print(flatten_features.size()) # torch.Size([batch_size, 8192])
68
+ volumes_4_r = volumes_4_l + flatten_features.view(-1, 128, 4, 4, 4)
69
+ # print(volumes_4_r.size()) # torch.Size([batch_size, 128, 4, 4, 4])
70
+ volumes_8_r = volumes_8_l + self.layer6(volumes_4_r)
71
+ # print(volumes_8_r.size()) # torch.Size([batch_size, 64, 8, 8, 8])
72
+ volumes_16_r = volumes_16_l + self.layer7(volumes_8_r)
73
+ # print(volumes_16_r.size()) # torch.Size([batch_size, 32, 16, 16, 16])
74
+ volumes_32_r = (volumes_32_l + self.layer8(volumes_16_r)) * 0.5
75
+ # print(volumes_32_r.size()) # torch.Size([batch_size, 1, 32, 32, 32])
76
+
77
+ return volumes_32_r.view((-1, self.cfg.CONST.N_VOX, self.cfg.CONST.N_VOX, self.cfg.CONST.N_VOX))
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ argparse
2
+ easydict
3
+ matplotlib
4
+ numpy
5
+ opencv-python
6
+ scipy
7
+ torch
8
+ torchvision
9
+ streamlit
10
+ plotly
11
+ pillow
utils/__pycache__/binvox_rw.cpython-38.pyc ADDED
Binary file (7.44 kB).
 
utils/__pycache__/data_transforms.cpython-38.pyc ADDED
Binary file (11.8 kB).
 
utils/binvox_rw.py ADDED
@@ -0,0 +1,296 @@
1
+ # Copyright (C) 2012 Daniel Maturana
2
+ # This file is part of binvox-rw-py.
3
+ #
4
+ # binvox-rw-py is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # binvox-rw-py is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with binvox-rw-py. If not, see <http://www.gnu.org/licenses/>.
16
+ #
17
+ """
18
+ Binvox to Numpy and back.
19
+
20
+
21
+ >>> import numpy as np
22
+ >>> import binvox_rw
23
+ >>> with open('chair.binvox', 'rb') as f:
24
+ ... m1 = binvox_rw.read_as_3d_array(f)
25
+ ...
26
+ >>> m1.dims
27
+ [32, 32, 32]
28
+ >>> m1.scale
29
+ 41.133000000000003
30
+ >>> m1.translate
31
+ [0.0, 0.0, 0.0]
32
+ >>> with open('chair_out.binvox', 'wb') as f:
33
+ ... m1.write(f)
34
+ ...
35
+ >>> with open('chair_out.binvox', 'rb') as f:
36
+ ... m2 = binvox_rw.read_as_3d_array(f)
37
+ ...
38
+ >>> m1.dims == m2.dims
39
+ True
40
+ >>> m1.scale == m2.scale
41
+ True
42
+ >>> m1.translate == m2.translate
43
+ True
44
+ >>> np.all(m1.data == m2.data)
45
+ True
46
+
47
+ >>> with open('chair.binvox', 'rb') as f:
48
+ ... md = binvox_rw.read_as_3d_array(f)
49
+ ...
50
+ >>> with open('chair.binvox', 'rb') as f:
51
+ ... ms = binvox_rw.read_as_coord_array(f)
52
+ ...
53
+ >>> data_ds = binvox_rw.dense_to_sparse(md.data)
54
+ >>> data_sd = binvox_rw.sparse_to_dense(ms.data, 32)
55
+ >>> np.all(data_sd == md.data)
56
+ True
57
+ >>> # the ordering of elements returned by numpy.nonzero changes with axis
58
+ >>> # ordering, so to compare for equality we first lexically sort the voxels.
59
+ >>> np.all(ms.data[:, np.lexsort(ms.data)] == data_ds[:, np.lexsort(data_ds)])
60
+ True
61
+ """
62
+
63
+ import numpy as np
64
+
65
+
66
+ class Voxels(object):
67
+ """ Holds a binvox model.
68
+ data is either a three-dimensional numpy boolean array (dense representation)
69
+ or a two-dimensional numpy float array (coordinate representation).
70
+
71
+ dims, translate and scale are the model metadata.
72
+
73
+ dims are the voxel dimensions, e.g. [32, 32, 32] for a 32x32x32 model.
74
+
75
+ scale and translate relate the voxels to the original model coordinates.
76
+
77
+ To translate voxel coordinates i, j, k to original coordinates x, y, z:
78
+
79
+ x_n = (i+.5)/dims[0]
80
+ y_n = (j+.5)/dims[1]
81
+ z_n = (k+.5)/dims[2]
82
+ x = scale*x_n + translate[0]
83
+ y = scale*y_n + translate[1]
84
+ z = scale*z_n + translate[2]
85
+
86
+ """
87
+ def __init__(self, data, dims, translate, scale, axis_order):
88
+ self.data = data
89
+ self.dims = dims
90
+ self.translate = translate
91
+ self.scale = scale
92
+ assert (axis_order in ('xzy', 'xyz'))
93
+ self.axis_order = axis_order
94
+
95
+ def clone(self):
96
+ data = self.data.copy()
97
+ dims = self.dims[:]
98
+ translate = self.translate[:]
99
+ return Voxels(data, dims, translate, self.scale, self.axis_order)
100
+
101
+ def write(self, fp):
102
+ write(self, fp)
103
+
104
+
105
+ def read_header(fp):
106
+ """ Read binvox header. Mostly meant for internal use.
107
+ """
108
+ line = fp.readline().strip()
109
+ if not line.startswith(b'#binvox'):
110
+ raise IOError('[ERROR] Not a binvox file')
111
+ dims = list(map(int, fp.readline().strip().split(b' ')[1:]))
112
+ translate = list(map(float, fp.readline().strip().split(b' ')[1:]))
113
+ scale = list(map(float, fp.readline().strip().split(b' ')[1:]))[0]
114
+ fp.readline()
115
+ return dims, translate, scale
116
+
117
+
118
+ def read_as_3d_array(fp, fix_coords=True):
119
+ """ Read binary binvox format as array.
120
+
121
+ Returns the model with accompanying metadata.
122
+
123
+ Voxels are stored in a three-dimensional numpy array, which is simple and
124
+ direct, but may use a lot of memory for large models. (Storage requirements
125
+ are 8*(d^3) bytes, where d is the dimensions of the binvox model. Numpy
126
+ boolean arrays use a byte per element).
127
+
128
+ Doesn't do any checks on input except for the '#binvox' line.
129
+ """
130
+ dims, translate, scale = read_header(fp)
131
+ raw_data = np.frombuffer(fp.read(), dtype=np.uint8)
132
+ # if just using reshape() on the raw data:
133
+ # indexing the array as array[i,j,k], the indices map into the
134
+ # coords as:
135
+ # i -> x
136
+ # j -> z
137
+ # k -> y
138
+ # if fix_coords is true, then data is rearranged so that
139
+ # mapping is
140
+ # i -> x
141
+ # j -> y
142
+ # k -> z
143
+ values, counts = raw_data[::2], raw_data[1::2]
144
+ data = np.repeat(values, counts).astype(np.int32)
145
+ data = data.reshape(dims)
146
+ if fix_coords:
147
+ # xzy to xyz TODO the right thing
148
+ data = np.transpose(data, (0, 2, 1))
149
+ axis_order = 'xyz'
150
+ else:
151
+ axis_order = 'xzy'
152
+ return Voxels(data, dims, translate, scale, axis_order)
153
+
154
+
155
+ def read_as_coord_array(fp, fix_coords=True):
156
+ """ Read binary binvox format as coordinates.
157
+
158
+ Returns binvox model with voxels in a "coordinate" representation, i.e. an
159
+ 3 x N array where N is the number of nonzero voxels. Each column
160
+ corresponds to a nonzero voxel and the 3 rows are the (x, z, y) coordinates
161
+ of the voxel. (The odd ordering is due to the way binvox format lays out
162
+ data). Note that coordinates refer to the binvox voxels, without any
163
+ scaling or translation.
164
+
165
+ Use this to save memory if your model is very sparse (mostly empty).
166
+
167
+ Doesn't do any checks on input except for the '#binvox' line.
168
+ """
169
+ dims, translate, scale = read_header(fp)
170
+ raw_data = np.frombuffer(fp.read(), dtype=np.uint8)
171
+
172
+ values, counts = raw_data[::2], raw_data[1::2]
173
+
174
+ # sz = np.prod(dims)
175
+ # index, end_index = 0, 0
176
+ end_indices = np.cumsum(counts)
177
+ indices = np.concatenate(([0], end_indices[:-1])).astype(end_indices.dtype)
178
+
179
+ values = values.astype(bool)  # np.bool was removed in NumPy 1.24+
180
+ indices = indices[values]
181
+ end_indices = end_indices[values]
182
+
183
+ nz_voxels = []
184
+ for index, end_index in zip(indices, end_indices):
185
+ nz_voxels.extend(range(index, end_index))
186
+ nz_voxels = np.array(nz_voxels)
187
+ # TODO are these dims correct?
188
+ # according to docs,
189
+ # index = x * wxh + z * width + y; // wxh = width * height = d * d
190
+
191
+ x = nz_voxels / (dims[0] * dims[1])
192
+ zwpy = nz_voxels % (dims[0] * dims[1]) # z*w + y
193
+ z = zwpy / dims[0]
194
+ y = zwpy % dims[0]
195
+ if fix_coords:
196
+ data = np.vstack((x, y, z))
197
+ axis_order = 'xyz'
198
+ else:
199
+ data = np.vstack((x, z, y))
200
+ axis_order = 'xzy'
201
+
202
+ #return Voxels(data, dims, translate, scale, axis_order)
203
+ return Voxels(np.ascontiguousarray(data), dims, translate, scale, axis_order)
204
+
205
+
206
+ def dense_to_sparse(voxel_data, dtype=int):
207
+ """ From dense representation to sparse (coordinate) representation.
208
+ No coordinate reordering.
209
+ """
210
+ if voxel_data.ndim != 3:
211
+ raise ValueError('[ERROR] voxel_data is wrong shape; should be 3D array.')
212
+ return np.asarray(np.nonzero(voxel_data), dtype)
213
+
214
+
215
+ def sparse_to_dense(voxel_data, dims, dtype=bool):
216
+ if voxel_data.ndim != 2 or voxel_data.shape[0] != 3:
217
+ raise ValueError('[ERROR] voxel_data is wrong shape; should be 3xN array.')
218
+ if np.isscalar(dims):
219
+ dims = [dims] * 3
220
+ dims = np.atleast_2d(dims).T
221
+ # truncate to integers
222
+ xyz = voxel_data.astype(int)  # np.int was removed in NumPy 1.24+
223
+ # discard voxels that fall outside dims
224
+ valid_ix = ~np.any((xyz < 0) | (xyz >= dims), 0)
225
+ xyz = xyz[:, valid_ix]
226
+ out = np.zeros(dims.flatten(), dtype=dtype)
227
+ out[tuple(xyz)] = True
228
+ return out
229
+
230
+
231
+ #def get_linear_index(x, y, z, dims):
232
+ #""" Assuming xzy order. (y increasing fastest.
233
+ #TODO ensure this is right when dims are not all same
234
+ #"""
235
+ #return x*(dims[1]*dims[2]) + z*dims[1] + y
236
+
237
+
238
+ def write(voxel_model, fp):
239
+ """ Write binary binvox format.
240
+
241
+ Note that when saving a model in sparse (coordinate) format, it is first
242
+ converted to dense format.
243
+
244
+ Doesn't check if the model is 'sane'.
245
+
246
+ """
247
+ if voxel_model.data.ndim == 2:
248
+ # TODO avoid conversion to dense
249
+ dense_voxel_data = sparse_to_dense(voxel_model.data, voxel_model.dims).astype(int)
250
+ else:
251
+ dense_voxel_data = voxel_model.data.astype(int)
252
+
253
+ file_header = [
254
+ '#binvox 1\n',
255
+ 'dim %s\n' % ' '.join(map(str, voxel_model.dims)),
256
+ 'translate %s\n' % ' '.join(map(str, voxel_model.translate)),
257
+ 'scale %s\n' % str(voxel_model.scale), 'data\n'
258
+ ]
259
+
260
+ for fh in file_header:
261
+ fp.write(fh.encode('latin-1'))
262
+
263
+ if voxel_model.axis_order not in ('xzy', 'xyz'):
264
+ raise ValueError('[ERROR] Unsupported voxel model axis order')
265
+
266
+ if voxel_model.axis_order == 'xzy':
267
+ voxels_flat = dense_voxel_data.flatten()
268
+ elif voxel_model.axis_order == 'xyz':
269
+ voxels_flat = np.transpose(dense_voxel_data, (0, 2, 1)).flatten()
270
+
271
+ # keep a sort of state machine for writing run length encoding
272
+ state = voxels_flat[0]
273
+ ctr = 0
274
+ for c in voxels_flat:
275
+ if c == state:
276
+ ctr += 1
277
+ # if ctr hits max, dump
278
+ if ctr == 255:
279
+ fp.write(chr(state).encode('latin-1'))
280
+ fp.write(chr(ctr).encode('latin-1'))
281
+ ctr = 0
282
+ else:
283
+ # if switch state, dump
284
+ fp.write(chr(state).encode('latin-1'))
285
+ fp.write(chr(ctr).encode('latin-1'))
286
+ state = c
287
+ ctr = 1
288
+ # flush out remainders
289
+ if ctr > 0:
290
+ fp.write(chr(state).encode('latin-1'))
291
+ fp.write(chr(ctr).encode('latin-1'))
292
+
293
+
294
+ if __name__ == '__main__':
295
+ import doctest
296
+ doctest.testmod()
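The Voxels/write pair above can also round-trip a predicted 32x32x32 occupancy grid back to a .binvox file; a brief sketch with a stand-in grid and an assumed output name (not part of the commit):

import numpy as np
import utils.binvox_rw as binvox_rw

gv = np.zeros((32, 32, 32), dtype=np.uint8)  # stand-in for a predicted occupancy grid
gv[12:20, 12:20, 12:20] = 1

vox = binvox_rw.Voxels(gv.astype(bool), [32, 32, 32], [0.0, 0.0, 0.0], 1.0, 'xyz')
with open('prediction.binvox', 'wb') as f:
    vox.write(f)  # run-length encoded binvox output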
utils/data_transforms.py ADDED
@@ -0,0 +1,452 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Developed by Haozhe Xie <[email protected]>
4
+ # References:
5
+ # - https://github.com/xiumingzhang/GenRe-ShapeHD
6
+
7
+ import cv2
8
+ # import matplotlib.pyplot as plt
9
+ # import matplotlib.patches as patches
10
+ import numpy as np
11
+ import os
12
+ import random
13
+ import torch
14
+
15
+
16
+ class Compose(object):
17
+ """ Composes several transforms together.
18
+ For example:
19
+ >>> transforms.Compose([
20
+ >>> transforms.RandomBackground(),
21
+ >>> transforms.CenterCrop(127, 127, 3),
22
+ >>> ])
23
+ """
24
+ def __init__(self, transforms):
25
+ self.transforms = transforms
26
+
27
+ def __call__(self, rendering_images, bounding_box=None):
28
+ for t in self.transforms:
29
+ if t.__class__.__name__ == 'RandomCrop' or t.__class__.__name__ == 'CenterCrop':
30
+ rendering_images = t(rendering_images, bounding_box)
31
+ else:
32
+ rendering_images = t(rendering_images)
33
+
34
+ return rendering_images
35
+
36
+
37
+ class ToTensor(object):
38
+ """
39
+ Convert a PIL Image or numpy.ndarray to tensor.
40
+ Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
41
+ """
42
+ def __call__(self, rendering_images):
43
+ assert (isinstance(rendering_images, np.ndarray))
44
+ array = np.transpose(rendering_images, (0, 3, 1, 2))
45
+ # handle numpy array
46
+ tensor = torch.from_numpy(array)
47
+
48
+ # put it from HWC to CHW format
49
+ return tensor.float()
50
+
51
+
52
+ class Normalize(object):
53
+ def __init__(self, mean, std):
54
+ self.mean = mean
55
+ self.std = std
56
+
57
+ def __call__(self, rendering_images):
58
+ assert (isinstance(rendering_images, np.ndarray))
59
+ rendering_images -= self.mean
60
+ rendering_images /= self.std
61
+
62
+ return rendering_images
63
+
64
+
65
+ class RandomPermuteRGB(object):
66
+ def __call__(self, rendering_images):
67
+ assert (isinstance(rendering_images, np.ndarray))
68
+
69
+ random_permutation = np.random.permutation(3)
70
+ for img_idx, img in enumerate(rendering_images):
71
+ rendering_images[img_idx] = img[..., random_permutation]
72
+
73
+ return rendering_images
74
+
75
+
76
+ class CenterCrop(object):
77
+ def __init__(self, img_size, crop_size):
78
+ """Set the height and weight before and after cropping"""
79
+ self.img_size_h = img_size[0]
80
+ self.img_size_w = img_size[1]
81
+ self.crop_size_h = crop_size[0]
82
+ self.crop_size_w = crop_size[1]
83
+
84
+ def __call__(self, rendering_images, bounding_box=None):
85
+ if len(rendering_images) == 0:
86
+ return rendering_images
87
+
88
+ crop_size_c = rendering_images[0].shape[2]
89
+ processed_images = np.empty(shape=(0, self.img_size_h, self.img_size_w, crop_size_c))
90
+ for img_idx, img in enumerate(rendering_images):
91
+ img_height, img_width, _ = img.shape
92
+
93
+ if bounding_box is not None:
94
+ bounding_box = [
95
+ bounding_box[0] * img_width,
96
+ bounding_box[1] * img_height,
97
+ bounding_box[2] * img_width,
98
+ bounding_box[3] * img_height
99
+ ] # yapf: disable
100
+
101
+ # Calculate the size of bounding boxes
102
+ bbox_width = bounding_box[2] - bounding_box[0]
103
+ bbox_height = bounding_box[3] - bounding_box[1]
104
+ bbox_x_mid = (bounding_box[2] + bounding_box[0]) * .5
105
+ bbox_y_mid = (bounding_box[3] + bounding_box[1]) * .5
106
+
107
+ # Make the crop area as a square
108
+ square_object_size = max(bbox_width, bbox_height)
109
+ x_left = int(bbox_x_mid - square_object_size * .5)
110
+ x_right = int(bbox_x_mid + square_object_size * .5)
111
+ y_top = int(bbox_y_mid - square_object_size * .5)
112
+ y_bottom = int(bbox_y_mid + square_object_size * .5)
113
+
114
+ # If the crop position is out of the image, fix it with padding
115
+ pad_x_left = 0
116
+ if x_left < 0:
117
+ pad_x_left = -x_left
118
+ x_left = 0
119
+ pad_x_right = 0
120
+ if x_right >= img_width:
121
+ pad_x_right = x_right - img_width + 1
122
+ x_right = img_width - 1
123
+ pad_y_top = 0
124
+ if y_top < 0:
125
+ pad_y_top = -y_top
126
+ y_top = 0
127
+ pad_y_bottom = 0
128
+ if y_bottom >= img_height:
129
+ pad_y_bottom = y_bottom - img_height + 1
130
+ y_bottom = img_height - 1
131
+
132
+ # Padding the image and resize the image
133
+ processed_image = np.pad(img[y_top:y_bottom + 1, x_left:x_right + 1],
134
+ ((pad_y_top, pad_y_bottom), (pad_x_left, pad_x_right), (0, 0)),
135
+ mode='edge')
136
+ processed_image = cv2.resize(processed_image, (self.img_size_w, self.img_size_h))
137
+ else:
138
+ if img_height > self.crop_size_h and img_width > self.crop_size_w:
139
+ x_left = int(img_width - self.crop_size_w) // 2
140
+ x_right = int(x_left + self.crop_size_w)
141
+ y_top = int(img_height - self.crop_size_h) // 2
142
+ y_bottom = int(y_top + self.crop_size_h)
143
+ else:
144
+ x_left = 0
145
+ x_right = img_width
146
+ y_top = 0
147
+ y_bottom = img_height
148
+
149
+ processed_image = cv2.resize(img[y_top:y_bottom, x_left:x_right], (self.img_size_w, self.img_size_h))
150
+
151
+ processed_images = np.append(processed_images, [processed_image], axis=0)
152
+ # Debug
153
+ # fig = plt.figure()
154
+ # ax1 = fig.add_subplot(1, 2, 1)
155
+ # ax1.imshow(img)
156
+ # if not bounding_box is None:
157
+ # rect = patches.Rectangle((bounding_box[0], bounding_box[1]),
158
+ # bbox_width,
159
+ # bbox_height,
160
+ # linewidth=1,
161
+ # edgecolor='r',
162
+ # facecolor='none')
163
+ # ax1.add_patch(rect)
164
+ # ax2 = fig.add_subplot(1, 2, 2)
165
+ # ax2.imshow(processed_image)
166
+ # plt.show()
167
+ return processed_images
168
+
169
+
170
+ class RandomCrop(object):
171
+ def __init__(self, img_size, crop_size):
172
+ """Set the height and weight before and after cropping"""
173
+ self.img_size_h = img_size[0]
174
+ self.img_size_w = img_size[1]
175
+ self.crop_size_h = crop_size[0]
176
+ self.crop_size_w = crop_size[1]
177
+
178
+ def __call__(self, rendering_images, bounding_box=None):
179
+ if len(rendering_images) == 0:
180
+ return rendering_images
181
+
182
+ crop_size_c = rendering_images[0].shape[2]
183
+ processed_images = np.empty(shape=(0, self.img_size_h, self.img_size_w, crop_size_c))
184
+ for img_idx, img in enumerate(rendering_images):
185
+ img_height, img_width, _ = img.shape
186
+
187
+ if bounding_box is not None:
188
+ bounding_box = [
189
+ bounding_box[0] * img_width,
190
+ bounding_box[1] * img_height,
191
+ bounding_box[2] * img_width,
192
+ bounding_box[3] * img_height
193
+ ] # yapf: disable
194
+
195
+ # Calculate the size of bounding boxes
196
+ bbox_width = bounding_box[2] - bounding_box[0]
197
+ bbox_height = bounding_box[3] - bounding_box[1]
198
+ bbox_x_mid = (bounding_box[2] + bounding_box[0]) * .5
199
+ bbox_y_mid = (bounding_box[3] + bounding_box[1]) * .5
200
+
201
+ # Make the crop area as a square
202
+ square_object_size = max(bbox_width, bbox_height)
203
+ square_object_size = square_object_size * random.uniform(0.8, 1.2)
204
+
205
+ x_left = int(bbox_x_mid - square_object_size * random.uniform(.4, .6))
206
+ x_right = int(bbox_x_mid + square_object_size * random.uniform(.4, .6))
207
+ y_top = int(bbox_y_mid - square_object_size * random.uniform(.4, .6))
208
+ y_bottom = int(bbox_y_mid + square_object_size * random.uniform(.4, .6))
209
+
210
+ # If the crop position is out of the image, fix it with padding
211
+ pad_x_left = 0
212
+ if x_left < 0:
213
+ pad_x_left = -x_left
214
+ x_left = 0
215
+ pad_x_right = 0
216
+ if x_right >= img_width:
217
+ pad_x_right = x_right - img_width + 1
218
+ x_right = img_width - 1
219
+ pad_y_top = 0
220
+ if y_top < 0:
221
+ pad_y_top = -y_top
222
+ y_top = 0
223
+ pad_y_bottom = 0
224
+ if y_bottom >= img_height:
225
+ pad_y_bottom = y_bottom - img_height + 1
226
+ y_bottom = img_height - 1
227
+
228
+ # Padding the image and resize the image
229
+ processed_image = np.pad(img[y_top:y_bottom + 1, x_left:x_right + 1],
230
+ ((pad_y_top, pad_y_bottom), (pad_x_left, pad_x_right), (0, 0)),
231
+ mode='edge')
232
+ processed_image = cv2.resize(processed_image, (self.img_size_w, self.img_size_h))
233
+ else:
234
+ if img_height > self.crop_size_h and img_width > self.crop_size_w:
235
+ x_left = int(img_width - self.crop_size_w) // 2
236
+ x_right = int(x_left + self.crop_size_w)
237
+ y_top = int(img_height - self.crop_size_h) // 2
238
+ y_bottom = int(y_top + self.crop_size_h)
239
+ else:
240
+ x_left = 0
241
+ x_right = img_width
242
+ y_top = 0
243
+ y_bottom = img_height
244
+
245
+ processed_image = cv2.resize(img[y_top:y_bottom, x_left:x_right], (self.img_size_w, self.img_size_h))
246
+
247
+ processed_images = np.append(processed_images, [processed_image], axis=0)
248
+
249
+ return processed_images
250
+
251
+
252
+ class RandomFlip(object):
253
+ def __call__(self, rendering_images):
254
+ assert (isinstance(rendering_images, np.ndarray))
255
+
256
+ for img_idx, img in enumerate(rendering_images):
257
+ if random.randint(0, 1):
258
+ rendering_images[img_idx] = np.fliplr(img)
259
+
260
+ return rendering_images
261
+
262
+
263
+ class ColorJitter(object):
264
+ def __init__(self, brightness, contrast, saturation):
265
+ self.brightness = brightness
266
+ self.contrast = contrast
267
+ self.saturation = saturation
268
+
269
+ def __call__(self, rendering_images):
270
+ if len(rendering_images) == 0:
271
+ return rendering_images
272
+
273
+ # Allocate new space for storing processed images
274
+ img_height, img_width, img_channels = rendering_images[0].shape
275
+ processed_images = np.empty(shape=(0, img_height, img_width, img_channels))
276
+
277
+ # Randomize the value of changing brightness, contrast, and saturation
278
+ brightness = 1 + np.random.uniform(low=-self.brightness, high=self.brightness)
279
+ contrast = 1 + np.random.uniform(low=-self.contrast, high=self.contrast)
280
+ saturation = 1 + np.random.uniform(low=-self.saturation, high=self.saturation)
281
+
282
+ # Randomize the order of changing brightness, contrast, and saturation
283
+ attr_names = ['brightness', 'contrast', 'saturation']
284
+ attr_values = [brightness, contrast, saturation] # The value of changing attrs
285
+ attr_indexes = np.array(range(len(attr_names))) # The order of changing attrs
286
+ np.random.shuffle(attr_indexes)
287
+
288
+ for img_idx, img in enumerate(rendering_images):
289
+ processed_image = img
290
+ for idx in attr_indexes:
291
+ processed_image = self._adjust_image_attr(processed_image, attr_names[idx], attr_values[idx])
292
+
293
+ processed_images = np.append(processed_images, [processed_image], axis=0)
294
+ # print('ColorJitter', np.mean(ori_img), np.mean(processed_image))
295
+ # fig = plt.figure(figsize=(8, 4))
296
+ # ax1 = fig.add_subplot(1, 2, 1)
297
+ # ax1.imshow(ori_img)
298
+ # ax2 = fig.add_subplot(1, 2, 2)
299
+ # ax2.imshow(processed_image)
300
+ # plt.show()
301
+ return processed_images
302
+
303
+ def _adjust_image_attr(self, img, attr_name, attr_value):
304
+ """
305
+ Adjust or randomize the specified attribute of the image
306
+
307
+ Args:
308
+ img: Image in BGR format
309
+ Numpy array of shape (h, w, 3)
310
+ attr_name: Image attribute to adjust or randomize
311
+ 'brightness', 'saturation', or 'contrast'
312
+ attr_value: the alpha for blending is randomly drawn from [1 - d, 1 + d]
313
+
314
+ Returns:
315
+ Output image in BGR format
316
+ Numpy array of the same shape as input
317
+ """
318
+ gs = self._bgr_to_gray(img)
319
+
320
+ if attr_name == 'contrast':
321
+ img = self._alpha_blend(img, np.mean(gs[:, :, 0]), attr_value)
322
+ elif attr_name == 'saturation':
323
+ img = self._alpha_blend(img, gs, attr_value)
324
+ elif attr_name == 'brightness':
325
+ img = self._alpha_blend(img, 0, attr_value)
326
+ else:
327
+ raise NotImplementedError(attr_name)
328
+ return img
329
+
330
+ def _bgr_to_gray(self, bgr):
331
+ """
332
+ Convert a BGR image to a grayscale image
333
+ Differences from cv2.cvtColor():
334
+ 1. Input image can be float
335
+ 2. Output image has three repeated channels, rather than a single channel
336
+
337
+ Args:
338
+ bgr: Image in BGR format
339
+ Numpy array of shape (h, w, 3)
340
+
341
+ Returns:
342
+ gs: Grayscale image
343
+ Numpy array of the same shape as input; the three channels are the same
344
+ """
345
+ ch = 0.114 * bgr[:, :, 0] + 0.587 * bgr[:, :, 1] + 0.299 * bgr[:, :, 2]
346
+ gs = np.dstack((ch, ch, ch))
347
+ return gs
348
+
349
+ def _alpha_blend(self, im1, im2, alpha):
350
+ """
351
+ Alpha blending of two images or one image and a scalar
352
+
353
+ Args:
354
+ im1, im2: Image or scalar
355
+ Numpy array and a scalar or two numpy arrays of the same shape
356
+ alpha: Weight of im1
357
+ Float ranging usually from 0 to 1
358
+
359
+ Returns:
360
+ im_blend: Blended image -- alpha * im1 + (1 - alpha) * im2
361
+ Numpy array of the same shape as input image
362
+ """
363
+ im_blend = alpha * im1 + (1 - alpha) * im2
364
+ return im_blend
365
+
366
+
367
+ class RandomNoise(object):
368
+ def __init__(self,
369
+ noise_std,
370
+ eigvals=(0.2175, 0.0188, 0.0045),
371
+ eigvecs=((-0.5675, 0.7192, 0.4009), (-0.5808, -0.0045, -0.8140), (-0.5836, -0.6948, 0.4203))):
372
+ self.noise_std = noise_std
373
+ self.eigvals = np.array(eigvals)
374
+ self.eigvecs = np.array(eigvecs)
375
+
376
+ def __call__(self, rendering_images):
377
+ alpha = np.random.normal(loc=0, scale=self.noise_std, size=3)
378
+ noise_rgb = \
379
+ np.sum(
380
+ np.multiply(
381
+ np.multiply(
382
+ self.eigvecs,
383
+ np.tile(alpha, (3, 1))
384
+ ),
385
+ np.tile(self.eigvals, (3, 1))
386
+ ),
387
+ axis=1
388
+ )
389
+
390
+ # Allocate new space for storing processed images
391
+ img_height, img_width, img_channels = rendering_images[0].shape
392
+ assert (img_channels == 3), "Please use RandomBackground to normalize image channels"
393
+ processed_images = np.empty(shape=(0, img_height, img_width, img_channels))
394
+
395
+ for img_idx, img in enumerate(rendering_images):
396
+ processed_image = img[:, :, ::-1] # BGR -> RGB
397
+ for i in range(img_channels):
398
+ processed_image[:, :, i] += noise_rgb[i]
399
+
400
+ processed_image = processed_image[:, :, ::-1] # RGB -> BGR
401
+ processed_images = np.append(processed_images, [processed_image], axis=0)
402
+ # from copy import deepcopy
403
+ # ori_img = deepcopy(img)
404
+ # print(noise_rgb, np.mean(processed_image), np.mean(ori_img))
405
+ # print('RandomNoise', np.mean(ori_img), np.mean(processed_image))
406
+ # fig = plt.figure(figsize=(8, 4))
407
+ # ax1 = fig.add_subplot(1, 2, 1)
408
+ # ax1.imshow(ori_img)
409
+ # ax2 = fig.add_subplot(1, 2, 2)
410
+ # ax2.imshow(processed_image)
411
+ # plt.show()
412
+ return processed_images
413
+
414
+
415
+ class RandomBackground(object):
416
+ def __init__(self, random_bg_color_range, random_bg_folder_path=None):
417
+ self.random_bg_color_range = random_bg_color_range
418
+ self.random_bg_files = []
419
+ if random_bg_folder_path is not None:
420
+ self.random_bg_files = os.listdir(random_bg_folder_path)
421
+ self.random_bg_files = [os.path.join(random_bg_folder_path, rbf) for rbf in self.random_bg_files]
422
+
423
+ def __call__(self, rendering_images):
424
+ if len(rendering_images) == 0:
425
+ return rendering_images
426
+
427
+ img_height, img_width, img_channels = rendering_images[0].shape
428
+ # If the image has the alpha channel, add the background
429
+ if not img_channels == 4:
430
+ return rendering_images
431
+
432
+ # Generate random background
433
+ r, g, b = np.array([
434
+ np.random.randint(self.random_bg_color_range[i][0], self.random_bg_color_range[i][1] + 1) for i in range(3)
435
+ ]) / 255.
436
+
437
+ random_bg = None
438
+ if len(self.random_bg_files) > 0:
439
+ random_bg_file_path = random.choice(self.random_bg_files)
440
+ random_bg = cv2.imread(random_bg_file_path).astype(np.float32) / 255.
441
+
442
+ # Apply random background
443
+ processed_images = np.empty(shape=(0, img_height, img_width, img_channels - 1))
444
+ for img_idx, img in enumerate(rendering_images):
445
+ alpha = (np.expand_dims(img[:, :, 3], axis=2) == 0).astype(np.float32)
446
+ img = img[:, :, :3]
447
+ bg_color = random_bg if random.randint(0, 1) and random_bg is not None else np.array([[[r, g, b]]])
448
+ img = alpha * bg_color + (1 - alpha) * img
449
+
450
+ processed_images = np.append(processed_images, [img], axis=0)
451
+
452
+ return processed_images