Spaces:
Running
Running
File size: 10,004 Bytes
c33d1b8 c1d7f36 7fb4aa3 ca63a5c 7388ff5 01a3594 193a8b0 49be5d4 fd197d1 44b4275 1523151 fd197d1 ca63a5c b56d61a ca63a5c c1d7f36 ca63a5c 923e631 ca63a5c 829e8f8 ca63a5c 89befaf 923e631 fd197d1 7fb4aa3 153a6a8 c33d1b8 0675aa2 7fb4aa3 c1d7f36 7fb4aa3 b7eedf7 7fb4aa3 e3b8f47 7fb4aa3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import gradio as gr
import spaces
import os
import subprocess
import torch
# Log the installed PyTorch version and the CUDA version reported by
# torch.version, one per line.
print(torch.__version__, torch.version.cuda, sep="\n")

# Create the X11 socket directory under the app-local tmp folder and give it
# mode 1777.
for _shell_cmd in (
    'mkdir -p /home/user/app/tmp/.X11-unix',
    'chmod 1777 /home/user/app/tmp/.X11-unix',
):
    os.system(_shell_cmd)

# Export the temp directory and the X display before Xvfb is started, so the
# child process (and everything launched afterwards) inherits them.
os.environ.update({
    "TMPDIR": "/home/user/app/tmp",
    "DISPLAY": ":0.0",
})

# Start Xvfb on display :0 with a single 640x480, 24-bit screen. The trailing
# '&' puts it in the background so start-up continues immediately.
# NOTE(review): presumably needed so the visualiser can render without a
# physical display - confirm.
os.system('Xvfb :0 -screen 0 640x480x24 -fp /home/user/app/tmp/.X11-unix &')
# download model
print("Downloading model weights")

# Base URL of the Hugging Face repository that hosts the checkpoints.
_WEIGHTS_REPO = "https://huggingface.co/ThunderVVV/HaWoR/resolve/main"

# (path inside the repository, local directory the file is saved into)
_WEIGHT_FILES = (
    ("external/metric_depth_vit_large_800k.pth", "./thirdparty/Metric3D/weights/"),
    ("external/droid.pth", "./weights/external/"),
    ("external/detector.pt", "./weights/external/"),
    ("hawor/checkpoints/hawor.ckpt", "./weights/hawor/checkpoints/"),
    ("hawor/checkpoints/infiller.pt", "./weights/hawor/checkpoints/"),
    ("hawor/model_config.yaml", "./weights/hawor/"),
)

for _remote_path, _dest_dir in _WEIGHT_FILES:
    _local_file = os.path.join(_dest_dir, os.path.basename(_remote_path))
    if os.path.isfile(_local_file):
        # Already present: a second plain wget would only store a duplicate
        # named "<file>.1" while this copy stays the one that gets loaded.
        continue
    _status = os.system(f'wget -q {_WEIGHTS_REPO}/{_remote_path} -P {_dest_dir}')
    if _status != 0:
        # Keep start-up best-effort, but make the failure visible in the logs
        # instead of letting it surface later as a missing-checkpoint error.
        print(f"WARNING: failed to download {_remote_path} (wget status {_status})")
def install_cuda_toolkit():
    """Install the CUDA 12.1.0 toolkit and export the build environment.

    Downloads the NVIDIA runfile installer into ``/tmp``, marks it executable
    and runs it with ``--silent --toolkit``. The exit status of every step is
    checked: on the first failure a warning is printed and the remaining
    steps are skipped.

    Regardless of the install outcome, the following environment variables
    are then set for this process and its children:

    * ``CUDA_HOME`` -> ``/usr/local/cuda``
    * ``PATH`` -> ``$CUDA_HOME/bin`` prepended
    * ``LD_LIBRARY_PATH`` -> ``$CUDA_HOME/lib`` prepended
    * ``TORCH_CUDA_ARCH_LIST`` -> ``"8.0;8.6"``
    """
    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)

    install_steps = (
        ["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE],
        ["chmod", "+x", CUDA_TOOLKIT_FILE],
        [CUDA_TOOLKIT_FILE, "--silent", "--toolkit"],
    )
    for step in install_steps:
        status = subprocess.call(step)
        if status != 0:
            # Later steps depend on this one, so running them would only
            # produce a more confusing secondary error.
            print(
                "WARNING: CUDA toolkit install step %r exited with status %s"
                % (step[0], status)
            )
            break

    cuda_home = "/usr/local/cuda"
    os.environ["CUDA_HOME"] = cuda_home

    # Prepend the toolkit directories. Only join with the previous value when
    # there is one: a trailing ':' would add an empty entry, which the shell
    # and the dynamic loader treat as the current working directory.
    for var_name, subdir in (("PATH", "bin"), ("LD_LIBRARY_PATH", "lib")):
        entries = ["%s/%s" % (cuda_home, subdir)]
        previous = os.environ.get(var_name)
        if previous:
            entries.append(previous)
        os.environ[var_name] = ":".join(entries)

    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
    # Pinning the architecture list explicitly avoids that error.
    # NOTE(review): presumably raised by the extension build when no GPU is
    # visible at build time - confirm.
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
# Build the compiled third-party packages at start-up. The CUDA toolkit is
# installed first; the pip builds below run after it has exported CUDA_HOME
# and the related environment variables.
print("Compling other packages")
install_cuda_toolkit()


def _pip_install(target):
    """Run ``pip install <target>`` and print a warning if pip reports failure.

    Returns the raw status from ``os.system`` (0 on success).
    """
    status = os.system(f'pip install {target}')
    if status != 0:
        # Start-up stays best-effort, but a failed build is now visible in the
        # logs instead of surfacing later as an ImportError.
        print(f"WARNING: 'pip install {target}' failed (status {status})")
    return status


_pip_install('./thirdparty/DROID-SLAM')
_pip_install('./thirdparty/DROID-SLAM/thirdparty/lietorch')

# Set only for the pytorch3d build, i.e. after the two builds above.
# NOTE(review): presumably forces the CUDA kernels to be compiled even when no
# GPU is visible during the build - confirm against the pytorch3d install docs.
os.environ["FORCE_CUDA"] = "1"
_pip_install('git+https://github.com/facebookresearch/pytorch3d.git@stable')
import numpy as np
from easydict import EasyDict
from scripts.scripts_test_video.detect_track_video import detect_track_video
from scripts.scripts_test_video.hawor_video import hawor_motion_estimation, hawor_infiller
from scripts.scripts_test_video.hawor_slam import hawor_slam
from hawor.utils.process import get_mano_faces, run_mano, run_mano_left
from lib.eval_utils.custom_utils import load_slam_cam
from lib.vis.run_vis2 import run_vis2_on_video, run_vis2_on_video_cam
@spaces.GPU(duration=300)
def render_reconstruction(input_video, img_focal):
    """Run the HaWoR pipeline on a video and render the reconstructed hands.

    The stages are, in order: hand detection/tracking, motion estimation,
    SLAM, infilling, MANO mesh generation for both hands, and world-space
    rendering through ``run_vis2_on_video``.

    Args:
        input_video: Value of the "Input video" component; forwarded to the
            pipeline as ``args.video_path``.
        img_focal: Value of the "Focal Length" component; forwarded as
            ``args.img_focal``. The value actually used for rendering is the
            one returned by ``hawor_motion_estimation``.

    Returns:
        The return value of ``run_vis2_on_video``. It is wired to a
        ``gr.Video`` output, so it is presumably the path of the rendered
        video - confirm against ``lib.vis.run_vis2``.

    Raises:
        gr.Error: If no input video was provided.
        NotImplementedError: If ``args.vis_mode`` is anything but ``'world'``.
    """
    if not input_video:
        # Pressing "Submit" without an upload passes None; fail with a message
        # the UI can display instead of an obscure error deep in the pipeline.
        raise gr.Error("Please upload an input video first.")

    args = EasyDict()
    args.video_path = input_video
    args.input_type = 'file'
    args.checkpoint = './weights/hawor/checkpoints/hawor.ckpt'
    args.infiller_weight = './weights/hawor/checkpoints/infiller.pt'
    args.vis_mode = 'world'
    args.img_focal = img_focal

    start_idx, end_idx, seq_folder, imgfiles = detect_track_video(args)
    # Note: img_focal is rebound to the value reported by motion estimation.
    frame_chunks_all, img_focal = hawor_motion_estimation(args, start_idx, end_idx, seq_folder)
    hawor_slam(args, start_idx, end_idx)
    slam_path = os.path.join(seq_folder, f"SLAM/hawor_slam_w_scale_{start_idx}_{end_idx}.npz")
    # Only the camera-to-world pose is needed for world-space rendering.
    _, _, R_c2w_sla_all, t_c2w_sla_all = load_slam_cam(slam_path)
    pred_trans, pred_rot, pred_hand_pose, pred_betas, _pred_valid = hawor_infiller(args, start_idx, end_idx, frame_chunks_all)

    # vis sequence for this video
    hand2idx = {
        "right": 1,
        "left": 0,
    }
    vis_start = 0
    # NOTE(review): combined with the half-open slices below, this leaves the
    # final predicted frame out of the visualisation - confirm it is intended.
    vis_end = pred_trans.shape[1] - 1

    def hand_params(hand):
        """Slice one hand's predictions over the visualised frame range."""
        idx = hand2idx[hand]
        window = (slice(idx, idx + 1), slice(vis_start, vis_end))
        return pred_trans[window], pred_rot[window], pred_hand_pose[window], pred_betas[window]

    # get faces
    faces = get_mano_faces()
    # Extra triangles appended to the MANO face list.
    # NOTE(review): presumably they close the open wrist of the mesh - confirm.
    faces_new = np.array([[92, 38, 234],
                          [234, 38, 239],
                          [38, 122, 239],
                          [239, 122, 279],
                          [122, 118, 279],
                          [279, 118, 215],
                          [118, 117, 215],
                          [215, 117, 214],
                          [117, 119, 214],
                          [214, 119, 121],
                          [119, 120, 121],
                          [121, 120, 78],
                          [120, 108, 78],
                          [78, 108, 79]])
    faces_right = np.concatenate([faces, faces_new], axis=0)
    # The left hand reuses the right-hand faces with the last two vertex
    # indices of every triangle swapped (reversed winding order).
    faces_left = faces_right[:, [0, 2, 1]]

    # get right hand vertices
    trans, rot, pose, betas = hand_params('right')
    pred_glob_r = run_mano(trans, rot, pose, betas=betas)
    right_dict = {
        'vertices': pred_glob_r['vertices'][0].unsqueeze(0),
        'faces': faces_right,
    }

    # get left hand vertices
    trans, rot, pose, betas = hand_params('left')
    pred_glob_l = run_mano_left(trans, rot, pose, betas=betas)
    left_dict = {
        'vertices': pred_glob_l['vertices'][0].unsqueeze(0),
        'faces': faces_left,
    }

    # diag(1, -1, -1): negates the Y and Z axes. The same transform is applied
    # to the camera poses and to both hands' vertices.
    # NOTE(review): presumably converts to the viewer's axis convention - confirm.
    R_x = torch.tensor([[1, 0, 0],
                        [0, -1, 0],
                        [0, 0, -1]]).float()
    R_c2w_sla_all = torch.einsum('ij,njk->nik', R_x, R_c2w_sla_all)
    t_c2w_sla_all = torch.einsum('ij,nj->ni', R_x, t_c2w_sla_all)
    left_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, left_dict['vertices'].cpu())
    right_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, right_dict['vertices'].cpu())

    # Here we use aitviewer(https://github.com/eth-ait/aitviewer) for simple visualization.
    if args.vis_mode != 'world':
        # Camera-space rendering (run_vis2_on_video_cam) is not wired up here.
        raise NotImplementedError

    output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
    os.makedirs(output_pth, exist_ok=True)
    image_names = imgfiles[vis_start:vis_end]
    print(f"vis {vis_start} to {vis_end}")
    return run_vis2_on_video(
        left_dict,
        right_dict,
        output_pth,
        img_focal,
        image_names,
        R_c2w=R_c2w_sla_all[vis_start:vis_end],
        t_c2w=t_c2w_sla_all[vis_start:vis_end],
        interactive=False,
    )
# HTML banner rendered at the top of the demo page: title, author list,
# affiliations and a row of badge links.
# NOTE(review): the first author link and the "Paper" badge still have an
# empty href, and the arXiv badge uses the placeholder id "xxxx.xxxxx".
header = ('''
<div class="embed_hidden" style="text-align: center;">
<h1> <b>HaWoR</b>: World-Space Hand Motion Reconstruction from Egocentric Videos</h1>
<h3>
<a href="" target="_blank" rel="noopener noreferrer">Jinglei Zhang</a><sup>1</sup>,
<a href="https://jiankangdeng.github.io/" target="_blank" rel="noopener noreferrer">Jiankang Deng</a><sup>2</sup>,
<br>
<a href="https://scholar.google.com/citations?user=syoPhv8AAAAJ&hl=en" target="_blank" rel="noopener noreferrer">Chao Ma</a><sup>1</sup>,
<a href="https://rolpotamias.github.io" target="_blank" rel="noopener noreferrer">Rolandos Alexandros Potamias</a><sup>2</sup>
</h3>
<h3>
<sup>1</sup>Shanghai Jiao Tong University;
<sup>2</sup>Imperial College London
</h3>
</div>
<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
<a href='https://arxiv.org/abs/xxxx.xxxxx'><img src='https://img.shields.io/badge/Arxiv-xxxx.xxxxx-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
<a href=''><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
<a href='https://hawor-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
<a href='https://github.com/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/GitHub-Code-black?style=flat&logo=github&logoColor=white'></a>
</div>
''')
# Gradio UI: an upload column (video + focal length + submit button) next to
# an output column showing the rendered reconstruction, followed by a row with
# one bundled example video.
# NOTE(review): css=".gradio-container" is a bare selector without any
# declarations - confirm whether real style rules were intended.
with gr.Blocks(title="HaWoR: World-Space Hand Motion Reconstruction from Egocentric Videos", css=".gradio-container") as demo:
    gr.Markdown(header)

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input video", sources=["upload"])
            img_focal = gr.Number(label="Focal Length", value=600)
            submit = gr.Button("Submit", variant="primary")

        with gr.Column():
            reconstruction = gr.Video(label="Reconstruction", show_download_button=True)

        # Run the full pipeline on click and show the result in the output video.
        submit.click(fn=render_reconstruction, inputs=[input_video, img_focal], outputs=[reconstruction])

    with gr.Row():
        # Clicking the example fills the input video component.
        example_images = gr.Examples(
            [
                ['./example/video_0.mp4'],
            ],
            inputs=input_video,
        )

# debug=True is passed through to Gradio's launch() unchanged.
demo.launch(debug=True)