ThunderVVV committed · commit 981eadf
1 parent: 193a8b0

update

Files changed:
- app.py +50 -20
- lib/vis/renderer_world.py +357 -0
- pre-requirements.txt +2 -1
- scripts/scripts_test_video/hawor_video.py +6 -0
app.py
CHANGED
@@ -49,13 +49,17 @@ os.environ["FORCE_CUDA"] = "1"
 os.system('pip install git+https://github.com/facebookresearch/pytorch3d.git@stable')
 
 import numpy as np
+import joblib
+import cv2
+import imageio
 from easydict import EasyDict
 from scripts.scripts_test_video.detect_track_video import detect_track_video
 from scripts.scripts_test_video.hawor_video import hawor_motion_estimation, hawor_infiller
 from scripts.scripts_test_video.hawor_slam import hawor_slam
 from hawor.utils.process import get_mano_faces, run_mano, run_mano_left
 from lib.eval_utils.custom_utils import load_slam_cam
-from lib.vis.run_vis2 import run_vis2_on_video, run_vis2_on_video_cam
+from lib.vis.run_vis2 import lookat_matrix, run_vis2_on_video, run_vis2_on_video_cam
+from lib.vis.renderer_world import Renderer
 
 @spaces.GPU(duration=300)
 def render_reconstruction(input_video, img_focal):
@@ -71,8 +75,9 @@ def render_reconstruction(input_video, img_focal):
 
     frame_chunks_all, img_focal = hawor_motion_estimation(args, start_idx, end_idx, seq_folder)
 
-    hawor_slam(args, start_idx, end_idx)
     slam_path = os.path.join(seq_folder, f"SLAM/hawor_slam_w_scale_{start_idx}_{end_idx}.npz")
+    if not os.path.exists(slam_path):
+        hawor_slam(args, start_idx, end_idx)
     R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all = load_slam_cam(slam_path)
 
     pred_trans, pred_rot, pred_hand_pose, pred_betas, pred_valid = hawor_infiller(args, start_idx, end_idx, frame_chunks_all)
@@ -134,24 +139,49 @@ def render_reconstruction(input_video, img_focal):
     left_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, left_dict['vertices'].cpu())
     right_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, right_dict['vertices'].cpu())
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # simple visualization
+    bin_size = 128
+    max_faces_per_bin = 20000
+    img = cv2.imread(imgfiles[0])
+    renderer = Renderer(img.shape[1], img.shape[0], 1800, 'cuda',
+                        bin_size=bin_size, max_faces_per_bin=max_faces_per_bin)
+
+    output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
+    if not os.path.exists(output_pth):
+        os.makedirs(output_pth)
+    image_names = imgfiles[vis_start:vis_end]
+    print(f"vis {vis_start} to {vis_end}")
+    # vis_video_path = run_vis2_on_video(left_dict, right_dict, output_pth, img_focal, image_names, R_c2w=R_c2w_sla_all[vis_start:vis_end], t_c2w=t_c2w_sla_all[vis_start:vis_end], interactive=False)
+    faces_left = torch.from_numpy(faces_left).cuda()
+    faces_right = torch.from_numpy(faces_right).cuda()
+    faces_all = torch.stack((faces_left, faces_right))
+
+    side_source = torch.tensor([0.463, -0.478, 2.456])
+    side_target = torch.tensor([0.026, -0.481, -3.184])
+    up = torch.tensor([1.0, 0.0, 0.0])
+    view_camera = lookat_matrix(side_source, side_target, up)
+    cam_R = view_camera[:3, :3].unsqueeze(0).cuda()
+    cam_T = view_camera[:3, 3].unsqueeze(0).cuda()
+    vis_video_imgs = []
+    writer = imageio.get_writer(f'{seq_folder}/vis_output.mp4', fps=30, mode='I',
+                                format='FFMPEG', macro_block_size=1)
+    renderer.set_ground(100, 0, 0)
+    for img_i, _ in enumerate(image_names):
+
+        vertices_left = left_dict['vertices'][:, img_i]
+        vertices_right = right_dict['vertices'][:, img_i]
+
+        cameras, lights = renderer.create_camera_from_cv(cam_R, cam_T)
+        verts_color = torch.tensor([0.207, 0.596, 0.792, 1.0]).unsqueeze(0).repeat(2, 1)
+        vertices_i = torch.stack((vertices_left, vertices_right))
+        rend, _ = renderer.render_multiple(vertices_i.cuda(), faces_all.cuda(), verts_color.cuda(), cameras, lights)
+
+        writer.append_data(rend)
+
+    writer.close()
+    print("finish")
+
+    return f'{seq_folder}/vis_output.mp4'
 
 
 
lib/vis/renderer_world.py
ADDED
@@ -0,0 +1,357 @@
# Useful rendering functions from WHAM (some modification)

import cv2
import torch
import numpy as np

from pytorch3d.renderer import (
    PerspectiveCameras,
    TexturesVertex,
    PointLights,
    Materials,
    RasterizationSettings,
    MeshRenderer,
    MeshRasterizer,
    SoftPhongShader,
)
from pytorch3d.structures import Meshes
from pytorch3d.structures.meshes import join_meshes_as_scene
from pytorch3d.renderer.cameras import look_at_rotation
from pytorch3d.renderer.camera_conversions import _cameras_from_opencv_projection

from lib.vis.wham_tools.tools import checkerboard_geometry

def overlay_image_onto_background(image, mask, bbox, background):
    if isinstance(image, torch.Tensor):
        image = image.detach().cpu().numpy()
    if isinstance(mask, torch.Tensor):
        mask = mask.detach().cpu().numpy()

    out_image = background.copy()
    bbox = bbox[0].int().cpu().numpy().copy()
    roi_image = out_image[bbox[1]:bbox[3], bbox[0]:bbox[2]]

    roi_image[mask] = image[mask]
    out_image[bbox[1]:bbox[3], bbox[0]:bbox[2]] = roi_image

    return out_image


def update_intrinsics_from_bbox(K_org, bbox):
    device, dtype = K_org.device, K_org.dtype

    K = torch.zeros((K_org.shape[0], 4, 4)
    ).to(device=device, dtype=dtype)
    K[:, :3, :3] = K_org.clone()
    K[:, 2, 2] = 0
    K[:, 2, -1] = 1
    K[:, -1, 2] = 1

    image_sizes = []
    for idx, bbox in enumerate(bbox):
        left, upper, right, lower = bbox
        cx, cy = K[idx, 0, 2], K[idx, 1, 2]

        new_cx = cx - left
        new_cy = cy - upper
        new_height = max(lower - upper, 1)
        new_width = max(right - left, 1)
        new_cx = new_width - new_cx
        new_cy = new_height - new_cy

        K[idx, 0, 2] = new_cx
        K[idx, 1, 2] = new_cy
        image_sizes.append((int(new_height), int(new_width)))

    return K, image_sizes


def perspective_projection(x3d, K, R=None, T=None):
    if R != None:
        x3d = torch.matmul(R, x3d.transpose(1, 2)).transpose(1, 2)
    if T != None:
        x3d = x3d + T.transpose(1, 2)

    x2d = torch.div(x3d, x3d[..., 2:])
    x2d = torch.matmul(K, x2d.transpose(-1, -2)).transpose(-1, -2)[..., :2]
    return x2d


def compute_bbox_from_points(X, img_w, img_h, scaleFactor=1.2):
    left = torch.clamp(X.min(1)[0][:, 0], min=0, max=img_w)
    right = torch.clamp(X.max(1)[0][:, 0], min=0, max=img_w)
    top = torch.clamp(X.min(1)[0][:, 1], min=0, max=img_h)
    bottom = torch.clamp(X.max(1)[0][:, 1], min=0, max=img_h)

    cx = (left + right) / 2
    cy = (top + bottom) / 2
    width = (right - left)
    height = (bottom - top)

    new_left = torch.clamp(cx - width/2 * scaleFactor, min=0, max=img_w-1)
    new_right = torch.clamp(cx + width/2 * scaleFactor, min=1, max=img_w)
    new_top = torch.clamp(cy - height / 2 * scaleFactor, min=0, max=img_h-1)
    new_bottom = torch.clamp(cy + height / 2 * scaleFactor, min=1, max=img_h)

    bbox = torch.stack((new_left.detach(), new_top.detach(),
                        new_right.detach(), new_bottom.detach())).int().float().T

    return bbox


class Renderer():
    def __init__(self, width, height, focal_length, device,
                 bin_size=None, max_faces_per_bin=None):

        self.width = width
        self.height = height
        self.focal_length = focal_length

        self.device = device

        self.initialize_camera_params()
        self.lights = PointLights(device=device, location=[[0.0, 0.0, -10.0]])
        self.create_renderer(bin_size, max_faces_per_bin)

    def create_renderer(self, bin_size, max_faces_per_bin):
        self.renderer = MeshRenderer(
            rasterizer=MeshRasterizer(
                raster_settings=RasterizationSettings(
                    image_size=self.image_sizes[0],
                    blur_radius=1e-5, bin_size=bin_size,
                    max_faces_per_bin=max_faces_per_bin),
            ),
            shader=SoftPhongShader(
                device=self.device,
                lights=self.lights,
            )
        )

    def initialize_camera_params(self):
        """Hard coding for camera parameters
        TODO: Do some soft coding"""

        # Extrinsics
        self.R = torch.diag(
            torch.tensor([1, 1, 1])
        ).float().to(self.device).unsqueeze(0)

        self.T = torch.tensor(
            [0, 0, 0]
        ).unsqueeze(0).float().to(self.device)

        # Intrinsics
        self.K = torch.tensor(
            [[self.focal_length, 0, self.width/2],
             [0, self.focal_length, self.height/2],
             [0, 0, 1]]
        ).unsqueeze(0).float().to(self.device)
        self.bboxes = torch.tensor([[0, 0, self.width, self.height]]).float()
        self.K_full, self.image_sizes = update_intrinsics_from_bbox(self.K, self.bboxes)

        # self.K_full = self.K # test
        self.cameras = self.create_camera()

    def create_camera(self, R=None, T=None):
        if R is not None:
            self.R = R.clone().view(1, 3, 3).to(self.device)
        if T is not None:
            self.T = T.clone().view(1, 3).to(self.device)

        return PerspectiveCameras(
            device=self.device,
            R=self.R, #.mT,
            T=self.T,
            K=self.K_full,
            image_size=self.image_sizes,
            in_ndc=False)

    def create_camera_from_cv(self, R, T, K=None, image_size=None):
        # R: [1, 3, 3] Tensor
        # T: [1, 3] Tensor
        # K: [1, 3, 3] Tensor
        # image_size: [1, 2] Tensor in HW
        if K is None:
            K = self.K

        if image_size is None:
            image_size = torch.tensor(self.image_sizes)

        cameras = _cameras_from_opencv_projection(R, T, K, image_size)
        lights = PointLights(device=K.device, location=T)

        return cameras, lights

    def set_ground(self, length, center_x, center_z):
        device = self.device
        v, f, vc, fc = map(torch.from_numpy, checkerboard_geometry(length=length, tile_width=1.0, c1=center_x, c2=center_z, up="z"))
        v[:, 2] -= 2 # z plane
        v, f, vc = v.to(device), f.to(device), vc.to(device)
        self.ground_geometry = [v, f, vc]


    def update_bbox(self, x3d, scale=2.0, mask=None):
        """ Update bbox of cameras from the given 3d points

        x3d: input 3D keypoints (or vertices), (num_frames, num_points, 3)
        """

        if x3d.size(-1) != 3:
            x2d = x3d.unsqueeze(0)
        else:
            x2d = perspective_projection(x3d.unsqueeze(0), self.K, self.R, self.T.reshape(1, 3, 1))

        if mask is not None:
            x2d = x2d[:, ~mask]

        bbox = compute_bbox_from_points(x2d, self.width, self.height, scale)
        self.bboxes = bbox

        self.K_full, self.image_sizes = update_intrinsics_from_bbox(self.K, bbox)
        self.cameras = self.create_camera()
        self.create_renderer()

    def reset_bbox(self,):
        bbox = torch.zeros((1, 4)).float().to(self.device)
        bbox[0, 2] = self.width
        bbox[0, 3] = self.height
        self.bboxes = bbox

        self.K_full, self.image_sizes = update_intrinsics_from_bbox(self.K, bbox)
        self.cameras = self.create_camera()
        self.create_renderer()

    def render_mesh(self, vertices, background, colors=[0.8, 0.8, 0.8]):
        self.update_bbox(vertices[::50], scale=1.2)
        vertices = vertices.unsqueeze(0)

        if colors[0] > 1: colors = [c / 255. for c in colors]
        verts_features = torch.tensor(colors).reshape(1, 1, 3).to(device=vertices.device, dtype=vertices.dtype)
        verts_features = verts_features.repeat(1, vertices.shape[1], 1)
        textures = TexturesVertex(verts_features=verts_features)

        mesh = Meshes(verts=vertices,
                      faces=self.faces,
                      textures=textures,)

        materials = Materials(
            device=self.device,
            specular_color=(colors, ),
            shininess=0
        )

        results = torch.flip(
            self.renderer(mesh, materials=materials, cameras=self.cameras, lights=self.lights),
            [1, 2]
        )
        image = results[0, ..., :3] * 255
        mask = results[0, ..., -1] > 1e-3

        image = overlay_image_onto_background(image, mask, self.bboxes, background.copy())
        self.reset_bbox()
        return image


    def render_with_ground(self, verts, faces, colors, cameras, lights):
        """
        :param verts (B, V, 3)
        :param faces (F, 3)
        :param colors (B, 3)
        """

        # (B, V, 3), (B, F, 3), (B, V, 3)
        verts, faces, colors = prep_shared_geometry(verts, faces, colors)
        # (V, 3), (F, 3), (V, 3)
        gv, gf, gc = self.ground_geometry
        verts = list(torch.unbind(verts, dim=0)) + [gv]
        faces = list(torch.unbind(faces, dim=0)) + [gf]
        colors = list(torch.unbind(colors, dim=0)) + [gc[..., :3]]
        mesh = create_meshes(verts, faces, colors)

        materials = Materials(
            device=self.device,
            shininess=0
        )

        results = self.renderer(mesh, cameras=cameras, lights=lights, materials=materials)
        image = (results[0, ..., :3].cpu().numpy() * 255).astype(np.uint8)

        return image

    def render_multiple(self, verts_list, faces_list, colors_list, cameras, lights):
        """
        :param verts (B, V, 3)
        :param faces (F, 3)
        :param colors (B, 3)
        """
        # (B, V, 3), (B, F, 3), (B, V, 3)
        verts_, faces_, colors_ = [], [], []
        for i, verts in enumerate(verts_list):
            colors = colors_list[[i]]
            faces = faces_list[i]
            verts_i, faces_i, colors_i = prep_shared_geometry(verts, faces, colors)
            if i == 0:
                verts_ = list(torch.unbind(verts_i, dim=0))
                faces_ = list(torch.unbind(faces_i, dim=0))
                colors_ = list(torch.unbind(colors_i, dim=0))
            else:
                verts_ += list(torch.unbind(verts_i, dim=0))
                faces_ += list(torch.unbind(faces_i, dim=0))
                colors_ += list(torch.unbind(colors_i, dim=0))

        # (V, 3), (F, 3), (V, 3)
        gv, gf, gc = self.ground_geometry
        verts_ += [gv]
        faces_ += [gf]
        colors_ += [gc[..., :3]]
        mesh = create_meshes(verts_, faces_, colors_)

        materials = Materials(
            device=self.device,
            shininess=0
        )
        results = self.renderer(mesh, cameras=cameras, lights=lights, materials=materials)
        image = (results[0, ..., :3].cpu().numpy() * 255).astype(np.uint8)
        mask = results[0, ..., -1].cpu().numpy() > 0
        return image, mask


def prep_shared_geometry(verts, faces, colors):
    """
    :param verts (B, V, 3)
    :param faces (F, 3)
    :param colors (B, 4)
    """
    B, V, _ = verts.shape
    F, _ = faces.shape
    colors = colors.unsqueeze(1).expand(B, V, -1)[..., :3]
    faces = faces.unsqueeze(0).expand(B, F, -1)
    return verts, faces, colors


def create_meshes(verts, faces, colors):
    """
    :param verts (B, V, 3)
    :param faces (B, F, 3)
    :param colors (B, V, 3)
    """
    textures = TexturesVertex(verts_features=colors)
    meshes = Meshes(verts=verts, faces=faces, textures=textures)
    return join_meshes_as_scene(meshes)


def get_global_cameras(verts, device, distance=5, position=(-5.0, 5.0, 0.0)):
    positions = torch.tensor([position]).repeat(len(verts), 1)
    targets = verts.mean(1)

    directions = targets - positions
    directions = directions / torch.norm(directions, dim=-1).unsqueeze(-1) * distance
    positions = targets - directions

    rotation = look_at_rotation(positions, targets, ).mT
    translation = -(rotation @ positions.unsqueeze(-1)).squeeze(-1)

    lights = PointLights(device=device, location=[position])
    return rotation, translation, lights
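For orientation, here is a minimal usage sketch of this Renderer, following the call pattern introduced in app.py above. The frame size, focal length, camera pose, and mesh shapes below are illustrative assumptions for this sketch, not values from the commit:

# Hypothetical driver for lib/vis/renderer_world.Renderer (shapes and values are assumptions).
import torch
from lib.vis.renderer_world import Renderer

width, height, focal_length = 1280, 720, 1800      # assumed frame size and focal length
renderer = Renderer(width, height, focal_length, 'cuda',
                    bin_size=128, max_faces_per_bin=20000)
renderer.set_ground(100, 0, 0)                      # checkerboard ground plane, as in app.py

# OpenCV-convention extrinsics: R is (1, 3, 3), T is (1, 3)
cam_R = torch.eye(3).unsqueeze(0).cuda()
cam_T = torch.tensor([[0.0, 0.0, 3.0]]).cuda()
cameras, lights = renderer.create_camera_from_cv(cam_R, cam_T)

# Two MANO-like meshes (e.g. left/right hand): verts (2, 1, 778, 3), faces (2, 1538, 3), RGBA colors (2, 4)
verts = torch.rand(2, 1, 778, 3).cuda()
faces = torch.randint(0, 778, (2, 1538, 3)).cuda()
colors = torch.tensor([0.207, 0.596, 0.792, 1.0]).unsqueeze(0).repeat(2, 1).cuda()

image, mask = renderer.render_multiple(verts, faces, colors, cameras, lights)
print(image.shape, image.dtype)                     # (H, W, 3) uint8 frame, ready for a video writer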
pre-requirements.txt
CHANGED
@@ -34,4 +34,5 @@ easydict
 loguru
 dill
 lapx
-moderngl-window==2.4.6
+moderngl-window==2.4.6
+imageio[ffmpeg]
scripts/scripts_test_video/hawor_video.py
CHANGED
@@ -65,6 +65,11 @@ def hawor_motion_estimation(args, start_idx, end_idx, seq_folder):
 
     tid = np.array([tr for tr in tracks])
 
+    if os.path.exists(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy'):
+        print("skip hawor motion estimation")
+        frame_chunks_all = joblib.load(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy')
+        return frame_chunks_all, img_focal
+
     print(f'Running hawor on {video} ...')
 
     left_trk = []
@@ -211,6 +216,7 @@ def hawor_motion_estimation(args, start_idx, end_idx, seq_folder):
 
     model_masks = model_masks > 0 # bool
     np.save(f'{seq_folder}/tracks_{start_idx}_{end_idx}/model_masks.npy', model_masks)
+    joblib.dump(frame_chunks_all, f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy')
     return frame_chunks_all, img_focal
 
 def hawor_infiller(args, start_idx, end_idx, frame_chunks_all):
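The change above follows a simple cache-to-disk pattern: dump the expensive result once, and short-circuit on the next run if the file already exists. A standalone sketch of the same joblib round-trip (the path and payload here are illustrative only, not the commit's actual data):

# Hypothetical sketch of the caching pattern used above.
import os
import joblib

cache_path = "cache/frame_chunks_all.npy"        # joblib accepts any file extension

def expensive_motion_estimation():
    return {"chunk_0": [1, 2, 3]}                # stand-in for the real result

if os.path.exists(cache_path):
    frame_chunks_all = joblib.load(cache_path)   # reuse the cached result
else:
    frame_chunks_all = expensive_motion_estimation()
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    joblib.dump(frame_chunks_all, cache_path)    # persist for the next run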