MotionCtrl_SVD / app.py
wzhouxiff's picture
Update app.py
5d17ee3 verified
import argparse
import os
import tempfile
import gradio as gr
import numpy as np
import torch
from glob import glob
from torchvision.transforms import CenterCrop, Compose, Resize
from gradio_utils.camera_utils import CAMERA_MOTION_MODE, process_camera, create_relative
from gradio_utils.utils import vis_camera
from gradio_utils.motionctrl_cmcm_gradio import build_model, motionctrl_sample
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort that
# can occur when several copies of the library get loaded — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# Value of the SPACE_ID environment variable, or '' when it is unset.
# NOTE(review): the --listen default further down checks os.environ directly, so this
# name is not read anywhere else in the code visible in this file.
SPACE_ID = os.environ.get('SPACE_ID', '')
#### Description ####
# Static HTML/Markdown snippets; main() renders each of them through gr.Markdown().
# Page heading.
title = r"""<h1 align="center">MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</h1>"""
# Sub-heading displayed right after the title.
subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
# Centered row of external links: paper, project page, code, showcases, tutorial.
important_link = r"""
<div align='center'>
<a href='https://wzhouxiff.github.io/projects/MotionCtrl/assets/paper/MotionCtrl.pdf'>[Paper]</a>
&ensp; <a href='https://wzhouxiff.github.io/projects/MotionCtrl/'>[Project Page]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl'>[Code]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md'>[Showcases]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md'>[Tutorial]</a>
</div>
"""
# Introductory blurb; mixes inline HTML with Markdown emphasis.
description = r"""
<b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</b></a>.<br>
🔥 MotionCtrl is capable of independently and flexibly controling the camera motion and object motion of a generated video, with only a unified model.<br>
🤗 Try to control the motion of the generated videos yourself!<br>
❗❗❗ Please note **ONLY** Camera Motion Control in the current version of **MotionCtrl** deployed on **SVD** is avaliable.<br>
❗❗❗ <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md' target='_blank'>Showcases</a> and
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>Tutorial</a> can be found
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>here</a><br>.
"""
# Disabled strip of showcase GIFs, kept for reference.
# <div>
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/00_ibzz5-dxv2h.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/01_5guvn-0x6v2.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/12_sn7bz-0hcaf.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/13_3lyco-4ru8j.gif", width="300">
# </div>
# Footer: star request, BibTeX citation and contact details.
# NOTE(review): the contact address inside the string reads "[email protected]", which
# looks like an e-mail-obfuscation placeholder rather than a real address — confirm
# against the upstream file before shipping.
article = r"""
If MotionCtrl is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
)](https://github.com/TencentARC/MotionCtrl)
---
📝 **Citation**
<br>
If our work is useful for your research, please consider citing:
```bibtex
@inproceedings{wang2023motionctrl,
title={MotionCtrl: A Unified and Flexible Motion Controller for Video Generation},
author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
booktitle={arXiv preprint arXiv:2312.03641},
year={2023}
}
```
📧 **Contact**
<br>
If you have any questions, please feel free to reach me out at <b>[email protected]</b>.
"""
# Custom CSS overrides for the demo page.
# NOTE(review): main() constructs gr.Blocks() without a css= argument, so this string
# is not applied anywhere in the code visible in this file.
css = """
.gradio-container {width: 85% !important}
.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important;}
span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; color: #d30f2f !important;}
button {border-radius: 8px !important;}
.add_button {background-color: #4CAF50 !important;}
.remove_button {background-color: #f44336 !important;}
.clear_button {background-color: gray !important;}
.mask_button_group {gap: 10px !important;}
.video {height: 300px !important;}
.image {height: 300px !important;}
.video .wrap.svelte-lcpz3o {display: flex !important; align-items: center !important; justify-content: center !important;}
.video .wrap.svelte-lcpz3o > :first-child {height: 100% !important;}
.margin_center {width: 50% !important; margin: auto !important;}
.jc_center {justify-content: center !important;}
"""
# Six unit translation vectors, one per axis direction.
# NOTE(review): T_base is not referenced elsewhere in the code visible in this file.
# NOTE(review): the original comments describe BOTH z rows as "camera moves forward";
# presumably the +z row (zoom out) should read "backward" — confirm.
T_base = [
[1.,0.,0.], ## positive x direction in W2C: camera faces left
[-1.,0.,0.], ## negative x direction in W2C: camera faces right
[0., 1., 0.], ## positive y direction in W2C: camera faces up
[0.,-1.,0.], ## negative y direction in W2C: camera faces down
[0.,0.,1.], ## positive z direction in W2C: camera moves forward (sic), zoom out
[0.,0.,-1.], ## negative z direction in W2C: camera moves forward, zoom in
]
# Placeholder camera plot: builds 16 poses with identity rotation and a z-translation
# that grows linearly, then renders them once. The resulting `fig` is the initial value
# of the gr.Plot created in main().
# NOTE(review): radius, look_at, T_list, res_forsave and T_range are assigned here but
# not read anywhere in the code visible in this file.
radius = 1
n = 16
# step =
look_at = np.array([0, 0, 0.8]).reshape(3,1)
# look_at = np.array([0, 0, 0.2]).reshape(3,1)
T_list = []
# 3x3 identity used as the rotation part of every pose.
base_R = np.array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
res = []
res_forsave = []
T_range = 1.8
for i in range(0, 16):
# theta = (1)*np.pi*i/n
R = base_R[:,:3]
# 3x1 column (0, 0, 2*i/n): the z offset runs from 0 up to 2*15/16.
T = np.array([0.,0.,1.]).reshape(3,1) * (i/n)*2
# 3x4 [R | T] matrix for this frame.
RT = np.concatenate([R,T], axis=1)
res.append(RT)
fig = vis_camera(res)
# MODE = ["camera motion control", "object motion control", "camera + object motion control"]
# NOTE(review): MODE is not referenced elsewhere in the code visible in this file.
MODE = ["control camera poses", "control object trajectory", "control both camera and object motion"]
# Labels of the two resize buttons created in main(); process_input_image() compares
# its resize_mode argument against RESIZE_MODE[0].
RESIZE_MODE = ['Center Crop To 576x1024', 'Keep original spatial ratio']
# Labels of the three "customized mode" buttons (combine1..combine3 in main()).
DIY_MODE = ['Customized Mode 1: First A then B',
'Customized Mode 2: Both A and B',
'Customized Mode 3: RAW Camera Poses']
## load default model
# Number of frames / sampler steps handed to build_model(); num_frames is also reused
# by the camera helpers and by model_run().
num_frames = 14
num_steps = 25
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")
config = "configs/inference/config_motionctrl_cmcm.yaml"
ckpt = 'checkpoints/motionctrl_svd.ckpt'
if not os.path.exists(ckpt):
    # Download the checkpoint in-process instead of chaining unchecked
    # `wget` / `mkdir` / `mv` shell commands: this does not need wget on PATH,
    # creates the target directory idempotently, and raises on a failed download
    # rather than letting build_model() trip over a missing file later.
    os.makedirs(os.path.dirname(ckpt), exist_ok=True)
    torch.hub.download_url_to_file(
        'https://huggingface.co/TencentARC/MotionCtrl/resolve/main/motionctrl_svd.ckpt?download=true',
        ckpt,
    )
model = build_model(config, ckpt, device, num_frames, num_steps)
# Current generation resolution; process_input_image() overwrites both globals.
width, height = 1024, 576
# NOTE(review): traj_list is not read anywhere in the code visible in this file.
traj_list = []
# Module-level camera selection shared (and mutated) by the UI callbacks below.
camera_dict = {
"motion":[], # basic camera moves picked so far (add_camera_motion keeps at most two)
"mode": "Customized Mode 1: First A then B", # "First A then B", "Both A and B", "Custom"
"speed": 1.0, # last value received from the "Motion Speed" slider
"complex": None # label of the selected Pose_N button, or None
}
def fn_vis_camera(camera_args):
    """Plot the currently selected camera path and reveal the generation step.

    Args:
        camera_args: Content of the camera textbox, forwarded to ``process_camera``.

    Returns:
        A 9-tuple: the camera figure followed by eight ``gr.update`` objects that
        make the "generate video" widgets visible (the video player is also
        cleared via ``value=None``).
    """
    poses = process_camera(
        camera_dict, camera_args, num_frames=num_frames, width=width, height=height
    )  # [t, 3, 4] per the original inline note

    # Keep the plot scale at 1.0 unless the largest absolute entry of the last
    # column, divided by 1.9, is bigger.
    scale = max(1.0, np.max(np.abs(poses[:, :, -1])) / 1.9)
    figure = vis_camera(create_relative(poses), rescale_T=scale)

    # Heading, description, FPS slider, n_samples, seed and start button.
    revealed = [gr.update(visible=True) for _ in range(6)]
    return (
        figure,
        *revealed,
        gr.update(visible=True, value=None),  # video player, emptied
        gr.update(visible=True),              # repeated hint box
    )
def display_camera_info(camera_dict, camera_mode=None):
    """Render the camera selection as a one-line, human-readable summary.

    Args:
        camera_dict: Mapping with the keys ``complex``, ``motion``, ``speed`` and
            ``mode``.
        camera_mode: Currently selected camera mode; the ``mode`` entry is only
            appended when this equals ``CAMERA_MOTION_MODE[2]``.

    Returns:
        The summary string shown in the camera textbox.
    """
    # A complex pose takes precedence over any basic motions.
    if camera_dict['complex'] is not None:
        return (
            f"complex : {camera_dict['complex']}. "
            + f"speed : {camera_dict['speed']}. "
        )

    pieces = [
        f"motion : {list(camera_dict['motion'])}. ",
        f"speed : {camera_dict['speed']}. ",
    ]
    if camera_mode == CAMERA_MOTION_MODE[2]:
        pieces.append(f"mode : {camera_dict['mode']}. ")
    return "".join(pieces)
def add_camera_motion(camera_motion, camera_mode):
    """Record a basic camera move in the shared ``camera_dict``.

    Args:
        camera_motion: Label of the clicked basic-motion button.
        camera_mode: Currently selected camera mode.

    Returns:
        The refreshed summary string from ``display_camera_info``.
    """
    global camera_dict
    # Picking a basic move cancels any previously chosen complex pose.
    camera_dict['complex'] = None

    chosen = camera_dict['motion']
    may_stack = camera_mode == CAMERA_MOTION_MODE[2] and len(chosen) < 2
    if may_stack:
        # Custom mode combines up to two moves.
        chosen.append(camera_motion)
    else:
        # Otherwise the new move replaces whatever was selected before.
        camera_dict['motion'] = [camera_motion]
    return display_camera_info(camera_dict, camera_mode)
def add_complex_camera_motion(camera_motion):
    """Store the chosen complex pose and return the refreshed summary string.

    Args:
        camera_motion: Label of the clicked ``Pose_N`` button.
    """
    global camera_dict
    camera_dict.update(complex=camera_motion)
    return display_camera_info(camera_dict)
def input_raw_camera_pose(combine_type, camera_mode):
    """Switch the UI to raw-pose entry and pre-fill the textbox with an example.

    Args:
        combine_type: Label of the clicked button; stored as ``camera_dict['mode']``.
        camera_mode: Unused; present because the click handler passes it.

    Returns:
        An 11-tuple of ``gr.update`` objects: the editable textbox holding 14
        example lines of 12 numbers each, eight updates hiding the basic-motion
        buttons, then two updates showing the speed slider and the raw-pose
        description.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    # One line per frame; only the last number changes, in steps of -0.225.
    example_poses = (
        '1 0 0 0 0 1 0 0 0 0 1 0\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.225\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.45\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.675\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.9\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.125\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.35\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.575\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.8\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.025\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.25\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.475\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.7\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.925\n'
    )
    textbox = gr.update(value=example_poses, max_lines=16, interactive=True)

    # U, D, L, R, I, O, ACW, CW are irrelevant while typing raw poses.
    hidden_buttons = [gr.update(visible=False) for _ in range(8)]
    return (
        textbox,
        *hidden_buttons,
        gr.update(visible=True),  # speed slider
        gr.update(visible=True),  # raw-pose description
    )
def change_camera_mode(combine_type, camera_mode):
    """Select a combination mode and bring the basic-motion controls back.

    Args:
        combine_type: Label of the clicked button; stored as ``camera_dict['mode']``.
        camera_mode: Currently selected camera mode, forwarded to
            ``display_camera_info``.

    Returns:
        An 11-tuple: the summary string, nine ``gr.update`` objects showing the
        eight basic-motion buttons and the speed slider, and one hiding the
        raw-pose description.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    summary = display_camera_info(camera_dict, camera_mode)
    # U, D, L, R, I, O, ACW, CW and the speed slider.
    shown = [gr.update(visible=True) for _ in range(9)]
    return (summary, *shown, gr.update(visible=False))
def change_camera_speed(camera_speed):
    """Store the slider value as ``camera_dict['speed']`` and return the new summary.

    Args:
        camera_speed: Value received from the "Motion Speed" slider.
    """
    global camera_dict
    camera_dict.update(speed=camera_speed)
    return display_camera_info(camera_dict)
def reset_camera():
    """Rebind the global ``camera_dict`` to a fresh default selection.

    Returns:
        The summary string for the reset state.
    """
    global camera_dict
    # A new dict is bound (rather than clearing the old one in place).
    camera_dict = dict(
        motion=[],
        mode="Customized Mode 1: First A then B",
        speed=1.0,
        complex=None,
    )
    return display_camera_info(camera_dict)
def visualized_camera_poses(step2_camera_motion):
    """Reveal the controls for the selected camera mode and hide the rest.

    The camera selection is reset first. The generation step is always hidden
    again, because a new camera path has to be visualized before generating.

    Args:
        step2_camera_motion: Selected entry of ``CAMERA_MOTION_MODE``. Anything
            other than entries 0 and 1 is treated as the custom mode.

    Returns:
        A 39-tuple of ``gr.update`` objects, positionally matching the
        ``outputs`` list of ``camera_info.click`` in ``main``.
    """
    reset_camera()

    basic = step2_camera_motion == CAMERA_MOTION_MODE[0]
    provided = (not basic) and step2_camera_motion == CAMERA_MOTION_MODE[1]
    custom = not (basic or provided)

    visibility = (
        [basic] * 2         # basic poses: title + description
        + [custom] * 2      # custom poses: title + description
        + [provided] * 2    # provided complex poses: title + description
        + [basic] * 8       # U, D, L, R, I, O, ACW, CW
        + [custom] * 3      # combine1, combine2, combine3
        + [False]           # combine3_des only appears after clicking combine3
        + [not custom]      # speed slider
        + [provided] * 8    # Pose_1 .. Pose_8
    )
    updates = [gr.update(visible=flag) for flag in visibility]

    # Camera textbox, its two buttons and the plot are always shown; the
    # textbox and the plot are emptied.
    updates.append(gr.update(visible=True, value=None))
    updates.extend(gr.update(visible=True) for _ in range(2))
    updates.append(gr.update(visible=True, value=None))

    # Generation step: heading, description, FPS, n_samples, seed, start,
    # video player and the repeated hint box.
    updates.extend(gr.update(visible=False) for _ in range(8))
    return tuple(updates)
def process_input_image(input_image, resize_mode):
    """Resize the uploaded image and advance the UI to the camera-mode step.

    Updates the module-level ``width`` / ``height`` so later steps use the
    processed image's size.

    Args:
        input_image: PIL image from the upload widget, or ``None`` when nothing
            has been uploaded yet.
        resize_mode: Label of the clicked resize button. ``RESIZE_MODE[0]``
            scales the image to cover 576x1024 and center-crops it; any other
            value keeps the aspect ratio, pinning height to 576 for portrait
            images and width to 1024 otherwise.

    Returns:
        A 44-tuple of ``gr.update`` objects, positionally matching the
        ``outputs`` list of the two resize-button click handlers in ``main``:
        the processed image, four updates revealing the camera-mode selector,
        and updates hiding every later widget.

    Raises:
        gr.Error: If no image has been uploaded. Previously this surfaced as an
            ``AttributeError`` on ``None.size``.
    """
    global width, height

    if input_image is None:
        raise gr.Error("Please upload an image before choosing a resize mode.")

    if resize_mode == RESIZE_MODE[0]:
        height = 576
        width = 1024
        w, h = input_image.size
        h_ratio = h / height
        w_ratio = w / width
        if h_ratio > w_ratio:
            # Relatively taller than the target: fit the width, crop the height.
            # max() guards against int() truncating just below the target.
            h = max(int(h / w_ratio), height)
            input_image = Resize((h, width))(input_image)
        else:
            # Relatively wider than the target: fit the height, crop the width.
            w = max(int(w / h_ratio), width)
            input_image = Resize((height, w))(input_image)
        input_image = CenterCrop((height, width))(input_image)
    else:
        w, h = input_image.size
        if h > w:
            height = 576
            width = int(w * height / h)
        else:
            width = 1024
            height = int(h * width / w)
        input_image = Resize((height, width))(input_image)

    return (
        gr.update(visible=True, value=input_image, height=height, width=width),
        # Step-2 heading, its description, the mode radio and the Proceed button.
        *[gr.update(visible=True) for _ in range(4)],
        # 6 section titles/descriptions, 8 basic-motion buttons, 3 combine
        # buttons, combine3_des, the speed slider and 8 pose buttons.
        *[gr.update(visible=False) for _ in range(27)],
        gr.update(visible=False, value=None),  # camera textbox, emptied
        gr.update(visible=False),              # reset-camera button
        gr.update(visible=False),              # visualize-camera button
        gr.update(visible=False, value=None),  # camera plot, emptied
        # Generation step: heading, description, FPS, n_samples, seed, start,
        # video player and the repeated hint box.
        *[gr.update(visible=False) for _ in range(8)],
    )
def model_run(input_image, fps_id, seed, n_samples, camera_args):
    """Generate video(s) for the processed image along the selected camera path.

    Args:
        input_image: Processed image taken from the preview widget.
        fps_id: Value of the "FPS" slider.
        seed: Random seed.
        n_samples: Number of samples to generate.
        camera_args: Content of the camera textbox, forwarded to ``process_camera``.

    Returns:
        Whatever ``motionctrl_sample`` returns; it is shown in the video player.
    """
    camera_poses = process_camera(
        camera_dict, camera_args, num_frames=num_frames, width=width, height=height
    )
    # One row of 12 values per pose.
    flat_poses = camera_poses.reshape(-1, 12)

    return motionctrl_sample(
        model=model,
        image=input_image,
        RT=flat_poses,
        num_frames=num_frames,
        fps_id=fps_id,
        decoding_t=1,
        seed=seed,
        sample_num=n_samples,
        device=device,
    )
def main(args):
# Build the whole Gradio Blocks UI, wire every event handler, then start the server.
# `args` is a dict of keyword arguments forwarded verbatim to launch() on the last line.
# Most widgets start hidden; the callbacks above reveal them step by step by returning
# gr.update(visible=...) objects in the same order as the `outputs` lists below.
# NOTE(review): the module-level `css` string is not passed to gr.Blocks() here.
demo = gr.Blocks()
with demo:
gr.Markdown(title)
gr.Markdown(subtitle)
gr.Markdown(important_link)
gr.Markdown(description)
with gr.Column():
# step 0/3: some useful tricks (always visible)
gr.Markdown("## Step 0/3: Some Useful Tricks", show_label=False)
gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
\n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
or increase `FPS`.", "Normal")],
color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=True)
# step 1/3: input an image (the step2_* variable names below use an older numbering)
step2_title = gr.Markdown("---\n## Step 1/3: Input an Image", show_label=False, visible=True)
step2_dec = gr.Markdown(f"\n 1. Upload an Image by `Drag` or Click `Upload Image`; \
\n 2. Click `{RESIZE_MODE[0]}` or `{RESIZE_MODE[1]}` to select the image resize mode. \
You will get a processed image and go into the next step. \
\n - `{RESIZE_MODE[0]}`: Our MotionCtrl is train on image with spatial size 576x1024. Choose `{RESIZE_MODE[0]}` can get better generated video. \
\n - `{RESIZE_MODE[1]}`: Choose `{RESIZE_MODE[1]}` if you want to generate video with the same spatial ratio as the input image.",
show_label=False, visible=True)
with gr.Row(equal_height=True):
with gr.Column(scale=2):
input_image = gr.Image(type="pil", interactive=True, elem_id="input_image", elem_classes='image', visible=True)
# process_input_image_button = gr.Button(value="Process Input Image", visible=False)
with gr.Row():
# Each button's value is one RESIZE_MODE label; the button itself is passed as an input
# to process_input_image, which compares that argument against RESIZE_MODE[0].
center_crop_botton = gr.Button(value=RESIZE_MODE[0], visible=True)
keep_spatial_raition_botton = gr.Button(value=RESIZE_MODE[1], visible=True)
with gr.Column(scale=2):
# Read-only preview of the resized image; it is also the image input of model_run.
process_image = gr.Image(type="pil", interactive=False, elem_id="process_image", elem_classes='image', visible=False)
# step2_proceed_button = gr.Button(value="Proceed", visible=False)
# step 2/3 - camera motion control
step2_camera_motion = gr.Markdown("---\n## Step 2/3: Select the camera poses", show_label=False, visible=False)
step2_camera_motion_des = gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Including 8 basic camera poses, such as pan up, pan down, zoom in, and zoom out. \
\n - {CAMERA_MOTION_MODE[1]}: Complex camera poses extracted from the real videos. \
\n - {CAMERA_MOTION_MODE[2]}: You can customize complex camera poses yourself by combining or fusing two of the eight basic camera poses or input RAW RT matrix. \
\n - Click `Proceed` to go into next step",
show_label=False, visible=False)
camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
# Despite its name, camera_info is the "Proceed" button that confirms the selected mode.
camera_info = gr.Button(value="Proceed", visible=False)
with gr.Row():
with gr.Column():
# step 2.1 - camera motion control - basic
basic_camera_motion = gr.Markdown("---\n### Basic Camera Poses", show_label=False, visible=False)
basic_camera_motion_des = gr.Markdown(f"\n 1. Click one of the basic camera poses, such as `Pan Up`; \
\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
show_label=False, visible=False)
# step 2.2 - camera motion control - provided complex
complex_camera_motion = gr.Markdown("---\n### Provided Complex Camera Poses", show_label=False, visible=False)
complex_camera_motion_des = gr.Markdown(f"\n 1. Click one of the complex camera poses, such as `Pose_1`; \
\n 2. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 3. Click `Reset Camera` to reset the camera poses (If needed). ",
show_label=False, visible=False)
# step 2.3 - camera motion control - custom
custom_camera_motion = gr.Markdown(f"---\n### {CAMERA_MOTION_MODE[2]}", show_label=False, visible=False)
custom_run_status = gr.Markdown(f"\n 1. Click `{DIY_MODE[0]}`, `{DIY_MODE[1]}`, or `{DIY_MODE[2]}` \
\n - `Customized Mode 1: First A then B`: For example, click `Pan Up` and `Pan Left`, the camera will first `Pan Up` and then `Pan Left`; \
\n - `Customized Mode 2: Both A and B`: For example, click `Pan Up` and `Pan Left`, the camera will move towards the upper left corner; \
\n - `{DIY_MODE[2]}`: Input the RAW RT matrix yourselves. \
\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
show_label=False, visible=False)
# NOTE(review): this hint is created hidden and never bound to a name, so no handler
# in the visible code can reveal it.
gr.HighlightedText(value=[("",""), ("1. Select two of the basic camera poses; 2. Select Customized Mode 1 OR Customized Mode 2. 3. Visualized Camera to show the customized camera poses", "Normal")],
color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
with gr.Row():
combine1 = gr.Button(value=DIY_MODE[0], visible=False)
combine2 = gr.Button(value=DIY_MODE[1], visible=False)
combine3 = gr.Button(value=DIY_MODE[2], visible=False)
with gr.Row():
combine3_des = gr.Markdown(f"---\n#### Input your camera pose in the following textbox. \
A total of 14 lines and each line contains 12 float number, indicated \
the RT matrix in the shape of 1x12. \
The example is RT matrix of ZOOM IN.", show_label=False, visible=False)
# Eight basic-motion buttons; each one is wired to add_camera_motion below.
with gr.Row():
U = gr.Button(value="Pan Up", visible=False)
D = gr.Button(value="Pan Down", visible=False)
L = gr.Button(value="Pan Left", visible=False)
R = gr.Button(value="Pan Right", visible=False)
with gr.Row():
I = gr.Button(value="Zoom In", visible=False)
O = gr.Button(value="Zoom Out", visible=False)
ACW = gr.Button(value="ACW", visible=False)
CW = gr.Button(value="CW", visible=False)
with gr.Row():
speed = gr.Slider(minimum=0, maximum=8, step=0.2, label="Motion Speed", value=1.0, visible=False)
# Eight provided complex poses; each one is wired to add_complex_camera_motion below.
with gr.Row():
Pose_1 = gr.Button(value="Pose_1", visible=False)
Pose_2 = gr.Button(value="Pose_2", visible=False)
Pose_3 = gr.Button(value="Pose_3", visible=False)
Pose_4 = gr.Button(value="Pose_4", visible=False)
with gr.Row():
Pose_5 = gr.Button(value="Pose_5", visible=False)
Pose_6 = gr.Button(value="Pose_6", visible=False)
Pose_7 = gr.Button(value="Pose_7", visible=False)
Pose_8 = gr.Button(value="Pose_8", visible=False)
with gr.Row():
# Shows the summary string from display_camera_info, or the raw poses typed by the user;
# its content is what fn_vis_camera and model_run receive as `camera_args`.
camera_args = gr.Textbox(value="Camera Type", label="Camera Type", visible=False)
with gr.Row():
camera_vis= gr.Button(value="Visualize Camera and Proceed", visible=False)
camera_reset = gr.Button(value="Reset Camera", visible=False)
with gr.Column():
# NOTE: inside main() this local name shadows the imported vis_camera() helper;
# the module-level callbacks still see the imported function.
vis_camera = gr.Plot(fig, label='Camera Poses', visible=False)
# step 3/3 - Generate videos
with gr.Row():
with gr.Column():
step3_prompt_generate = gr.Markdown("---\n## Step 3/3: Generate videos", show_label=False, visible=False)
generation_dec = gr.Markdown(f"\n 1. Set `FPS`.; \
\n 2. Set `n_samples`; \
\n 3. Set `seed`; \
\n 4. Click `Start generation !` to generate videos; ", visible=False)
# prompt = gr.Textbox(value="a dog sitting on grass", label="Prompt", interactive=True, visible=False)
# Despite its name, `prompt` is the FPS slider; it is passed to model_run as fps_id.
prompt = gr.Slider(minimum=5, maximum=30, step=1, label="FPS", value=10, visible=False)
n_samples = gr.Number(value=1, precision=0, interactive=True, label="n_samples", visible=False)
seed = gr.Number(value=1234, precision=0, interactive=True, label="Seed", visible=False)
start = gr.Button(value="Start generation !", visible=False)
with gr.Column():
gen_video = gr.Video(value=None, label="Generate Video", visible=False)
repeat_highlight=gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
\n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
or increase `FPS`.", "Normal")],
color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
# ---- Event wiring ----
# Both resize buttons run process_input_image; the 44 outputs must stay in the same
# order as the tuple that function returns.
center_crop_botton.click(
fn=process_input_image,
inputs=[input_image, center_crop_botton],
outputs=[
process_image,
step2_camera_motion,
step2_camera_motion_des,
camera_mode,
camera_info,
basic_camera_motion,
basic_camera_motion_des,
custom_camera_motion,
custom_run_status,
complex_camera_motion,
complex_camera_motion_des,
U, D, L, R,
I, O, ACW, CW,
combine1, combine2, combine3, combine3_des,
speed,
Pose_1, Pose_2, Pose_3, Pose_4,
Pose_5, Pose_6, Pose_7, Pose_8,
camera_args,
camera_reset, camera_vis,
vis_camera,
step3_prompt_generate,
generation_dec,
prompt,
n_samples,
seed, start, gen_video, repeat_highlight])
keep_spatial_raition_botton.click(
fn=process_input_image,
inputs=[input_image, keep_spatial_raition_botton],
outputs=[
process_image,
step2_camera_motion,
step2_camera_motion_des,
camera_mode,
camera_info,
basic_camera_motion,
basic_camera_motion_des,
custom_camera_motion,
custom_run_status,
complex_camera_motion,
complex_camera_motion_des,
U, D, L, R,
I, O, ACW, CW,
combine1, combine2, combine3, combine3_des,
speed,
Pose_1, Pose_2, Pose_3, Pose_4,
Pose_5, Pose_6, Pose_7, Pose_8,
camera_args,
camera_reset, camera_vis,
vis_camera,
step3_prompt_generate,
generation_dec,
prompt,
n_samples,
seed, start, gen_video, repeat_highlight])
# "Proceed": show the controls of the chosen camera mode (39 outputs, same order as
# the tuple returned by visualized_camera_poses).
camera_info.click(
fn=visualized_camera_poses,
inputs=[camera_mode],
outputs=[basic_camera_motion,
basic_camera_motion_des,
custom_camera_motion,
custom_run_status,
complex_camera_motion,
complex_camera_motion_des,
U, D, L, R,
I, O, ACW, CW,
combine1, combine2, combine3, combine3_des,
speed,
Pose_1, Pose_2, Pose_3, Pose_4,
Pose_5, Pose_6, Pose_7, Pose_8,
camera_args,
camera_reset, camera_vis,
vis_camera,
step3_prompt_generate, generation_dec, prompt, n_samples, seed, start, gen_video, repeat_highlight],
)
# Basic-motion buttons: the button itself is the first input of add_camera_motion.
U.click(fn=add_camera_motion, inputs=[U, camera_mode], outputs=camera_args)
D.click(fn=add_camera_motion, inputs=[D, camera_mode], outputs=camera_args)
L.click(fn=add_camera_motion, inputs=[L, camera_mode], outputs=camera_args)
R.click(fn=add_camera_motion, inputs=[R, camera_mode], outputs=camera_args)
I.click(fn=add_camera_motion, inputs=[I, camera_mode], outputs=camera_args)
O.click(fn=add_camera_motion, inputs=[O, camera_mode], outputs=camera_args)
ACW.click(fn=add_camera_motion, inputs=[ACW, camera_mode], outputs=camera_args)
CW.click(fn=add_camera_motion, inputs=[CW, camera_mode], outputs=camera_args)
speed.change(fn=change_camera_speed, inputs=speed, outputs=camera_args)
camera_reset.click(fn=reset_camera, inputs=None, outputs=[camera_args])
# Customized modes 1 and 2 re-show the basic-motion buttons; mode 3 hides them and
# fills the textbox with example raw poses.
combine1.click(fn=change_camera_mode,
inputs=[combine1, camera_mode],
outputs=[camera_args,
U, D, L, R,
I, O, ACW, CW, speed,
combine3_des])
combine2.click(fn=change_camera_mode,
inputs=[combine2, camera_mode],
outputs=[camera_args,
U, D, L, R,
I, O, ACW, CW,
speed,
combine3_des])
combine3.click(fn=input_raw_camera_pose,
inputs=[combine3, camera_mode],
outputs=[camera_args,
U, D, L, R,
I, O, ACW, CW,
speed,
combine3_des])
# Plot the selected camera path and reveal the generation step.
camera_vis.click(fn=fn_vis_camera, inputs=[camera_args],
outputs=[vis_camera,
step3_prompt_generate,
generation_dec,
prompt,
n_samples,
seed,
start,
gen_video,
repeat_highlight])
# Provided complex poses: the button itself is the input of add_complex_camera_motion.
Pose_1.click(fn=add_complex_camera_motion, inputs=Pose_1, outputs=camera_args)
Pose_2.click(fn=add_complex_camera_motion, inputs=Pose_2, outputs=camera_args)
Pose_3.click(fn=add_complex_camera_motion, inputs=Pose_3, outputs=camera_args)
Pose_4.click(fn=add_complex_camera_motion, inputs=Pose_4, outputs=camera_args)
Pose_5.click(fn=add_complex_camera_motion, inputs=Pose_5, outputs=camera_args)
Pose_6.click(fn=add_complex_camera_motion, inputs=Pose_6, outputs=camera_args)
Pose_7.click(fn=add_complex_camera_motion, inputs=Pose_7, outputs=camera_args)
Pose_8.click(fn=add_complex_camera_motion, inputs=Pose_8, outputs=camera_args)
# Input order matches model_run(input_image, fps_id, seed, n_samples, camera_args).
start.click(fn=model_run,
inputs=[process_image, prompt, seed, n_samples, camera_args],
outputs=gen_video)
# set example: every PNG under assets/demo/images next to this file
gr.Markdown("## Examples")
examples = glob(os.path.join(os.path.dirname(__file__), "./assets/demo/images", "*.png"))
gr.Examples(
examples=examples,
inputs=[input_image],
examples_per_page=15
)
gr.Markdown(article)
# demo.launch(server_name='0.0.0.0', share=False, server_port=args['server_port'])
# demo.queue(concurrency_count=1, max_size=10)
# demo.launch()
# Enable the request queue (max_size=10) and start serving with the caller's launch kwargs.
demo.queue(max_size=10).launch(**args)
if __name__ == "__main__":
    # Command-line entry point: translate CLI flags into launch() keyword
    # arguments and hand them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--listen',
        type=str,
        # Bind to all interfaces only when SPACE_ID is present in the environment.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument('--username', type=str, default='', help='Username for authentication')
    parser.add_argument('--password', type=str, default='', help='Password for authentication')
    parser.add_argument('--server_port', type=int, default=0, help='Port to run the server listener on')
    parser.add_argument('--inbrowser', action='store_true', help='Open in browser')
    parser.add_argument('--share', action='store_true', help='Share the gradio UI')
    args = parser.parse_args()

    launch_kwargs = {'server_name': args.listen}
    # Authentication is enabled only when both credentials are supplied.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    # Remaining options are forwarded only when set to a truthy value, so
    # launch() keeps its own defaults otherwise.
    launch_kwargs.update({
        option: getattr(args, option)
        for option in ('server_port', 'inbrowser', 'share')
        if getattr(args, option)
    })
    main(launch_kwargs)