reedmayhew committed on
Commit
0b3b09b
·
verified ·
1 Parent(s): 7d5d19b

Upload app.py

Files changed (1)
app.py +754 -0
app.py ADDED
@@ -0,0 +1,754 @@
#!/usr/bin/env python
"""
This is the full application script for VideoPainter.
It first checks for and, if necessary, installs missing dependencies.
When installing the custom packages (diffusers and app), it uses the
--no-build-isolation flag so that the already-installed torch is visible
to the build. If the custom diffusers package fails to provide the expected
submodules (e.g. CogvideoXBranchModel), the script attempts to patch the
installation in place.
"""
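# Illustrative usage note (not part of the original upload): the script is intended to be
# run directly as the entry point of a GPU-enabled Space or host with network access.
# The environment variable below is an assumption based on the OpenRouter client set up
# further down; without it the app falls back to a dummy captioning client.
#
#   export OPENROUTER_API_KEY=<your key>   # optional, enables caption enhancement
#   python app.py                          # installs deps, downloads models, launches Gradio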

import os
import sys
import subprocess
import warnings
import time
import importlib
from collections import OrderedDict  # used by init_state() below

warnings.filterwarnings("ignore")

# Set Gradio temp directory via environment variable
GRADIO_TEMP_DIR = "./tmp_gradio"
os.makedirs(GRADIO_TEMP_DIR, exist_ok=True)
os.makedirs(f"{GRADIO_TEMP_DIR}/track", exist_ok=True)
os.makedirs(f"{GRADIO_TEMP_DIR}/inpaint", exist_ok=True)
os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR

def install_package(package_spec):
    print(f"Installing {package_spec} ...")
    try:
        # Split so that multi-package specs such as "torch>=2.0.0 torchvision>=0.15.0"
        # are passed to pip as separate requirements.
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + package_spec.split())
        print(f"Successfully installed {package_spec}")
        return True
    except Exception as e:
        print(f"Failed to install {package_spec}: {e}")
        return False

print("Checking for PyTorch ...")
try:
    import torch
    print("PyTorch is already installed.")
except ImportError:
    print("PyTorch not found, installing...")
    if not install_package("torch>=2.0.0 torchvision>=0.15.0"):
        print("Failed to install PyTorch, which is required.")
        sys.exit(1)
    import torch  # bind the freshly installed package so later code can use it

# First, install the wheel package, which is needed for the bdist_wheel command
install_package("wheel")

# Install ninja for faster builds
install_package("ninja")

# Check and install other critical dependencies
critical_dependencies = [
    ("hydra", "hydra-core>=1.3.2"),
    ("omegaconf", "omegaconf>=2.3.0"),
    ("decord", "decord>=0.6.0"),
    ("diffusers", "diffusers>=0.24.0"),  # will be replaced with the custom build below
    ("transformers", "transformers>=4.35.0"),
    ("gradio", "gradio>=4.0.0"),
    ("numpy", "numpy>=1.24.0"),
    ("cv2", "opencv-python>=4.8.0"),
    ("PIL", "Pillow>=10.0.0"),
    ("scipy", "scipy>=1.11.0"),
    ("einops", "einops>=0.7.0"),
    ("onnxruntime", "onnxruntime>=1.16.0"),
    ("timm", "timm>=0.9.0"),
    ("safetensors", "safetensors>=0.4.0"),
    ("moviepy", "moviepy>=1.0.3"),
    ("imageio", "imageio>=2.30.0"),
    ("tqdm", "tqdm>=4.64.0"),
    ("openai", "openai>=1.5.0"),
    ("psutil", "psutil>=5.9.0")
]

for mod_name, pkg_spec in critical_dependencies:
    try:
        if mod_name == "PIL":
            from PIL import Image
        elif mod_name == "cv2":
            import cv2
        else:
            __import__(mod_name)
        print(f"{mod_name} is already installed.")
    except ImportError:
        print(f"{mod_name} not found, installing {pkg_spec} ...")
        install_package(pkg_spec)

print("Setting up environment...")
# Clone the VideoPainter repository if not present
if not os.path.exists("VideoPainter"):
    print("Cloning VideoPainter repository...")
    os.system("git clone https://github.com/TencentARC/VideoPainter.git")

# Add necessary paths to sys.path
sys.path.append(os.path.join(os.getcwd(), "VideoPainter"))
sys.path.append(os.path.join(os.getcwd(), "VideoPainter/app"))
sys.path.append(os.path.join(os.getcwd(), "app"))
sys.path.append(".")

# Ensure the custom diffusers is importable
if os.path.exists("VideoPainter/diffusers"):
    print("Installing custom diffusers...")
    # First, remove any existing diffusers installation
    subprocess.call([sys.executable, "-m", "pip", "uninstall", "-y", "diffusers"])

    # Copy the files directly into the site-packages directory instead of using pip install -e
    import site
    site_packages = site.getsitepackages()[0]
    diffusers_src = os.path.join(os.getcwd(), "VideoPainter/diffusers/src/diffusers")
    diffusers_dst = os.path.join(site_packages, "diffusers")

    print(f"Copying diffusers from {diffusers_src} to {diffusers_dst}")
    if not os.path.exists(diffusers_dst):
        os.makedirs(diffusers_dst, exist_ok=True)

    # Copy diffusers files directly
    os.system(f"cp -r {diffusers_src}/* {diffusers_dst}/")

    # Also add VideoPainter/diffusers/src to sys.path
    sys.path.append(os.path.join(os.getcwd(), "VideoPainter/diffusers/src"))

    # Verify the custom model is available
    try:
        # Force-reload diffusers to pick up the new files
        if "diffusers" in sys.modules:
            del sys.modules["diffusers"]
        import diffusers
        print(f"Diffusers version: {diffusers.__version__}")
        print(f"Available modules in diffusers: {dir(diffusers)}")

        # Check if the models directory exists in the custom diffusers
        models_dir = os.path.join(diffusers_dst, "models")
        if os.path.exists(models_dir):
            print(f"Models in diffusers: {os.listdir(models_dir)}")
    except Exception as e:
        print(f"Error verifying diffusers installation: {e}")

# Copy the app directory if needed
if not os.path.exists("app"):
    os.makedirs("app", exist_ok=True)
    print("Copying VideoPainter/app to local app directory...")
    os.system("cp -r VideoPainter/app/* app/")

# Don't try to install the app package, just add it to the path
print("Adding app directory to Python path...")
app_path = os.path.join(os.getcwd(), "app")
sys.path.insert(0, app_path)

# Insert the VideoPainter path at the beginning of sys.path so it takes precedence
sys.path.insert(0, os.path.join(os.getcwd(), "VideoPainter"))

print("Importing standard modules and dependencies ...")
try:
    import gradio as gr
    import cv2
    import numpy as np
    import scipy
    import scipy.ndimage  # explicitly load the ndimage submodule (used for mask dilation later)
    import torchvision
    from PIL import Image
    from huggingface_hub import snapshot_download
    from decord import VideoReader
except ImportError as e:
    print(f"Error importing basic modules: {e}")
    sys.exit(1)

# Import specialized modules with better error handling
try:
    # Import our custom modules
    from sam2.build_sam import build_sam2_video_predictor

    # Force a reload of diffusers after the direct copy
    if "diffusers" in sys.modules:
        del sys.modules["diffusers"]

    # Now import diffusers with an explicit path to the files we need
    sys.path.insert(0, os.path.join(os.getcwd(), "VideoPainter/app"))

    # Import utils after setting up the correct paths
    from utils import load_model, generate_frames
    print("All modules imported successfully!")
except ImportError as e:
    print(f"Error importing specialized modules: {e}")
    print("Paths:", sys.path)

    # Try to diagnose and fix the specific issue
    if "CogvideoXBranchModel" in str(e):
        print("Trying to fix missing CogvideoXBranchModel...")

        # Check if the model file exists in the repository
        branch_model_file = "VideoPainter/diffusers/src/diffusers/models/cogvideox_branch.py"
        if os.path.exists(branch_model_file):
            print(f"Found branch model file at {branch_model_file}")

            # Manually import the module
            sys.path.insert(0, os.path.join(os.getcwd(), "VideoPainter/diffusers/src"))

            # Add the import to __init__.py if it is not already there
            init_file = os.path.join(site_packages, "diffusers/__init__.py")
            with open(init_file, 'r') as f:
                init_content = f.read()

            if "CogvideoXBranchModel" not in init_content:
                print("Adding CogvideoXBranchModel to diffusers/__init__.py")
                with open(init_file, 'a') as f:
                    f.write("\nfrom .models.cogvideox_branch import CogvideoXBranchModel\n")

            # Force-reload diffusers
            if "diffusers" in sys.modules:
                del sys.modules["diffusers"]

            # Try importing again
            from utils import load_model, generate_frames
            print("Fixed CogvideoXBranchModel import issue!")
        else:
            print(f"Could not find {branch_model_file}")
            sys.exit(1)
    else:
        sys.exit(1)


###############################
# Begin Application Code (VideoPainter demo)
###############################

def download_models():
    print("Downloading models from Hugging Face Hub...")
    models = {
        "CogVideoX-5b-I2V": "THUDM/CogVideoX-5b-I2V",
        "VideoPainter": "TencentARC/VideoPainter"
    }
    model_paths = {}
    os.makedirs("ckpt", exist_ok=True)
    for name, repo_id in models.items():
        print(f"Downloading {name} from {repo_id}...")
        path = snapshot_download(repo_id=repo_id)
        model_paths[name] = path
        print(f"Downloaded {name} to {path}")
    try:
        flux_path = snapshot_download(repo_id="black-forest-labs/FLUX.1-Fill-dev")
        model_paths["FLUX"] = flux_path
    except Exception as e:
        print(f"Failed to download FLUX model: {e}")
        model_paths["FLUX"] = None
    os.makedirs("ckpt/Grounded-SAM-2", exist_ok=True)
    sam2_path = "ckpt/Grounded-SAM-2/sam2_hiera_large.pt"
    if not os.path.exists(sam2_path):
        print(f"Downloading SAM2 to {sam2_path}...")
        os.system(f"wget -O {sam2_path} https://huggingface.co/spaces/sam2/sam2/resolve/main/sam2_hiera_large.pt")
    model_paths["SAM2"] = sam2_path
    return model_paths

print("Initializing application environment...")
if not os.path.exists("app"):
    print("Setting up app folder from VideoPainter repository ...")
    os.system("git clone https://github.com/TencentARC/VideoPainter.git")
    os.makedirs("app", exist_ok=True)
    os.system("cp -r VideoPainter/app/* app/")
    os.system("pip install --no-build-isolation -e VideoPainter/diffusers")
    os.chdir("app")
    os.system("pip install --no-build-isolation -e .")
    os.chdir("..")

sys.path.append("app")
sys.path.append(".")

# Import project modules (again, to be safe)
try:
    from decord import VideoReader
    from sam2.build_sam import build_sam2_video_predictor
    from utils import load_model, generate_frames
except ImportError as e:
    print(f"Failed to import specialized modules: {e}")
    sys.exit(1)

# Set up OpenRouter / OpenAI (for caption generation)
try:
    from openai import OpenAI
    vlm_model = OpenAI(
        api_key=os.getenv("OPENROUTER_API_KEY", ""),
        base_url="https://openrouter.ai/api/v1"
    )
    print("OpenRouter client initialized successfully")
except Exception as e:
    print(f"OpenRouter API not available: {e}")

    class DummyModel:
        def __getattr__(self, name):
            return self
        def __call__(self, *args, **kwargs):
            return self
        def create(self, *args, **kwargs):
            class DummyResponse:
                choices = [type('obj', (object,), {'message': type('obj', (object,), {'content': "OpenRouter API not available. Using default prompt."})})]
            return DummyResponse()

    vlm_model = DummyModel()
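# Illustrative sketch (not in the original upload) of how this client would typically be
# used for caption enhancement; the model id below is an assumption. DummyModel mimics the
# same .chat.completions.create(...).choices[0].message.content access pattern, so the
# calling code works with or without an API key.
#
#   response = vlm_model.chat.completions.create(
#       model="openai/gpt-4o",  # hypothetical OpenRouter model id
#       messages=[{"role": "user", "content": "Describe this video scene."}],
#   )
#   caption = response.choices[0].message.content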

###############################
# Download models and initialize predictors
###############################
model_paths = download_models()
base_model_path = model_paths["CogVideoX-5b-I2V"]
videopainter_path = model_paths["VideoPainter"]
inpainting_branch = os.path.join(videopainter_path, "checkpoints/branch")
id_adapter = os.path.join(videopainter_path, "VideoPainterID/checkpoints")
img_inpainting_model = model_paths.get("FLUX")
sam2_checkpoint = "ckpt/Grounded-SAM-2/sam2_hiera_large.pt"
model_cfg = "sam2_hiera_l.yaml"

try:
    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)
    print("Build SAM2 predictor done!")
    validation_pipeline, validation_pipeline_img = load_model(
        model_path=base_model_path,
        inpainting_branch=inpainting_branch,
        id_adapter=id_adapter,
        img_inpainting_model=img_inpainting_model
    )
    print("Load model done!")
except Exception as e:
    print(f"Error initializing models: {e}")
    sys.exit(1)

###############################
# Helper functions & state definitions
###############################
EXAMPLES = [
    [
        "https://huggingface.co/spaces/TencentARC/VideoPainter/resolve/main/examples/ferry.mp4",
        "A white ferry with red and blue accents, named 'COLONIA', cruises on a calm river...",
        "White and red passenger ferry boat labeled 'COLONIA 6' with multiple windows, life buoys, and upper deck seating.",
        "Positive",
        "Inpaint",
        "",
        42,
        6.0,
        16,
        [[[320, 240]], [1]],
    ],
    [
        "https://huggingface.co/spaces/TencentARC/VideoPainter/resolve/main/examples/street.mp4",
        "A bustling city street at night illuminated by festive lights, a red double-decker bus...",
        "The rear of a black car with illuminated red tail lights and a visible license plate.",
        "Positive",
        "Inpaint",
        "",
        42,
        6.0,
        16,
        [[[200, 400]], [1]],
    ],
]

class StatusMessage:
    INFO = "Info"
    WARNING = "Warning"
    ERROR = "Error"
    SUCCESS = "Success"

def create_status(message, status_type=StatusMessage.INFO):
    timestamp = time.strftime("%H:%M:%S")
    return [("", ""), (f"[{timestamp}]: {message}\n", status_type)]

def update_status(previous_status, new_message, status_type=StatusMessage.INFO):
    timestamp = time.strftime("%H:%M:%S")
    history = previous_status[-3:]
    history.append((f"[{timestamp}]: {new_message}\n", status_type))
    return [("", "")] + history

def init_state(offload_video_to_cpu=False, offload_state_to_cpu=False):
    inference_state = {}
    inference_state["images"] = torch.zeros([1, 3, 100, 100])
    inference_state["num_frames"] = 1
    inference_state["offload_video_to_cpu"] = offload_video_to_cpu
    inference_state["offload_state_to_cpu"] = offload_state_to_cpu
    inference_state["video_height"] = 100
    inference_state["video_width"] = 100
    inference_state["device"] = torch.device("cuda")
    inference_state["storage_device"] = torch.device("cpu") if offload_state_to_cpu else torch.device("cuda")
    inference_state["point_inputs_per_obj"] = {}
    inference_state["mask_inputs_per_obj"] = {}
    inference_state["cached_features"] = {}
    inference_state["constants"] = {}
    inference_state["obj_id_to_idx"] = OrderedDict()
    inference_state["obj_idx_to_id"] = OrderedDict()
    inference_state["obj_ids"] = []
    inference_state["output_dict"] = {"cond_frame_outputs": {}, "non_cond_frame_outputs": {}}
    inference_state["output_dict_per_obj"] = {}
    inference_state["temp_output_dict_per_obj"] = {}
    inference_state["consolidated_frame_inds"] = {"cond_frame_outputs": set(), "non_cond_frame_outputs": set()}
    inference_state["tracking_has_started"] = False
    inference_state["frames_already_tracked"] = {}
    inference_state = gr.State(inference_state)
    return inference_state
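# Note (added for clarity, not in the original upload): the keys above appear to mirror the
# fields that SAM2's video predictor keeps in its internal inference state; the 1x3x100x100
# placeholder image is only a stub so that a gr.State exists before any video is loaded.
# Once a video is uploaded, get_frames_from_video() replaces this state with the real one
# returned by predictor.init_state(images=frames, ...).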

# Additional helper functions: get_frames_from_video, sam_refine, vos_tracking_video,
# inpaint_video, generate_video_from_frames, process_example, reset_all.

def get_frames_from_video(video_input, video_state):
    video_path = video_input
    frames = []
    user_name = time.time()
    vr = VideoReader(video_path)
    original_fps = vr.get_avg_fps()
    if original_fps > 8:
        total_frames = len(vr)
        sample_interval = max(1, int(original_fps / 8))
        frame_indices = list(range(0, total_frames, sample_interval))
        frames = vr.get_batch(frame_indices).asnumpy()
    else:
        frames = vr.get_batch(list(range(len(vr)))).asnumpy()
    frames = frames[:49]
    resized_frames = [cv2.resize(frame, (720, 480)) for frame in frames]
    frames = np.array(resized_frames)
    init_start = time.time()
    inference_state = predictor.init_state(images=frames, offload_video_to_cpu=True, async_loading_frames=True)
    init_time = time.time() - init_start
    print(f"Inference state initialization took {init_time:.2f}s")
    fps = 8
    image_size = (frames[0].shape[0], frames[0].shape[1])
    video_state = {
        "user_name": user_name,
        "video_name": os.path.split(video_path)[-1],
        "origin_images": frames,
        "painted_images": frames.copy(),
        "masks": [np.zeros((frames[0].shape[0], frames[0].shape[1]), np.uint8)] * len(frames),
        "logits": [None] * len(frames),
        "select_frame_number": 0,
        "fps": fps,
        "ann_obj_id": 0
    }
    video_info = f"Video Name: {video_state['video_name']}, FPS: {video_state['fps']}, Total Frames: {len(frames)}, Image Size: {image_size}"
    video_input_path = generate_video_from_frames(frames, output_path=f"{GRADIO_TEMP_DIR}/inpaint/original_{video_state['video_name']}", fps=fps)
    return (gr.update(visible=True), gr.update(visible=True), inference_state, video_state, video_info,
            video_state["origin_images"][0], gr.update(visible=False, maximum=len(frames), value=1, interactive=True),
            gr.update(visible=False, maximum=len(frames), value=len(frames), interactive=True), gr.update(visible=True, interactive=True),
            gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True), gr.update(visible=True),
            gr.update(visible=True, interactive=False), create_status("Upload video complete. Ready to select targets.", StatusMessage.SUCCESS), video_input_path)

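# Note (added for clarity, not in the original upload): frames are resampled to roughly
# 8 fps, capped at 49 frames, and resized to 720x480 — the clip length and resolution that
# the CogVideoX-5b-I2V based inpainting pipeline below expects.
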
def select_template(image_selection_slider, video_state, interactive_state, previous_status):
    image_selection_slider -= 1
    video_state["select_frame_number"] = image_selection_slider
    return video_state["painted_images"][image_selection_slider], video_state, interactive_state, update_status(previous_status, f"Set tracking start at frame {image_selection_slider}.", StatusMessage.INFO)

def get_end_number(track_pause_number_slider, video_state, interactive_state, previous_status):
    interactive_state["track_end_number"] = track_pause_number_slider
    return video_state["painted_images"][track_pause_number_slider], interactive_state, update_status(previous_status, f"Set tracking finish at frame {track_pause_number_slider}.", StatusMessage.INFO)

def sam_refine(inference_state, video_state, point_prompt, click_state, interactive_state, evt: gr.SelectData, previous_status):
    # The gr.SelectData annotation lets Gradio inject the click event for template_frame.select();
    # process_example() below calls this function directly with a MockEvent instead.
    ann_obj_id = 0
    ann_frame_idx = video_state["select_frame_number"]
    if point_prompt == "Positive":
        coordinate = f"[[{evt.index[0]},{evt.index[1]},1]]"
        interactive_state["positive_click_times"] += 1
    else:
        coordinate = f"[[{evt.index[0]},{evt.index[1]},0]]"
        interactive_state["negative_click_times"] += 1
    print(f"sam_refine, point_prompt: {point_prompt}, click_state: {click_state}")
    prompt = {"prompt_type": ["click"], "input_point": click_state[0], "input_label": click_state[1], "multimask_output": "True"}
    points = np.array(prompt["input_point"])
    labels = np.array(prompt["input_label"])
    height, width = video_state["origin_images"][0].shape[0:2]
    for i in range(len(points)):
        points[i, 0] = int(points[i, 0])
        points[i, 1] = int(points[i, 1])
    print(f"sam_refine points: {points}, labels: {labels}")
    frame_idx, obj_ids, mask = predictor.add_new_points(inference_state=inference_state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, points=points, labels=labels)
    mask_ = mask.cpu().squeeze().detach().numpy()
    mask_[mask_ <= 0] = 0
    mask_[mask_ > 0] = 1
    org_image = video_state["origin_images"][video_state["select_frame_number"]]
    mask_ = cv2.resize(mask_, (width, height))
    mask_ = mask_[:, :, None]
    mask_[mask_ > 0.5] = 1
    mask_[mask_ <= 0.5] = 0
    color = 63 * np.ones((height, width, 3)) * np.array([[[np.random.randint(5), np.random.randint(5), np.random.randint(5)]]])
    painted_image = np.uint8((1 - 0.5 * mask_) * org_image + 0.5 * mask_ * color)
    video_state["masks"][video_state["select_frame_number"]] = mask_
    video_state["painted_images"][video_state["select_frame_number"]] = painted_image
    return painted_image, video_state, interactive_state, update_status(previous_status, "Segmentation updated. Add more points or continue tracking.", StatusMessage.SUCCESS)

def clear_click(inference_state, video_state, click_state, previous_status):
    predictor.reset_state(inference_state)
    click_state = [[], []]
    template_frame = video_state["origin_images"][video_state["select_frame_number"]]
    return inference_state, template_frame, click_state, update_status(previous_status, "Click history cleared.", StatusMessage.INFO)

def vos_tracking_video(inference_state, video_state, interactive_state, previous_status):
    height, width = video_state["origin_images"][0].shape[0:2]
    masks = []
    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
        mask = np.zeros([480, 720, 1])
        for i in range(len(out_mask_logits)):
            out_mask = out_mask_logits[i].cpu().squeeze().detach().numpy()
            out_mask[out_mask > 0] = 1
            out_mask[out_mask <= 0] = 0
            out_mask = out_mask[:, :, None]
            mask += out_mask
        mask = cv2.resize(mask, (width, height))
        mask = mask[:, :, None]
        mask[mask > 0.5] = 1
        mask[mask < 1] = 0
        mask = scipy.ndimage.binary_dilation(mask, iterations=6)
        masks.append(mask)
    masks = np.array(masks)
    if interactive_state.get("track_end_number") is not None:
        video_state["masks"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = masks
        org_images = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
        video_state["painted_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = painted_images
    else:
        video_state["masks"] = masks
        org_images = video_state["origin_images"]
        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
        video_state["painted_images"] = painted_images
    video_output = generate_video_from_frames(video_state["painted_images"], output_path=f"{GRADIO_TEMP_DIR}/track/{video_state['video_name']}", fps=video_state["fps"])
    interactive_state["inference_times"] += 1
    print(f"vos_tracking_video output: {video_output}")
    # gr.update() is used instead of the removed gr.Button.update() for Gradio 4.x compatibility;
    # the trailing no-op update corresponds to the Notes accordion so the number of returned
    # values matches the outputs list of the Tracking button.
    return inference_state, video_output, video_state, interactive_state, update_status(previous_status, "Tracking complete.", StatusMessage.SUCCESS), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update()
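# Note (added for clarity, not in the original upload): each propagated SAM2 mask is resized
# back to the source frame size, binarized, and then dilated (scipy.ndimage.binary_dilation,
# 6 iterations) so the inpainting stage gets a small margin around the tracked object's boundary.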

def inpaint_video(video_state, video_caption, target_region_frame1_caption, interactive_state, previous_status, seed_param, cfg_scale, dilate_size):
    seed = int(seed_param) if int(seed_param) >= 0 else np.random.randint(0, 2**32 - 1)
    validation_images = video_state["origin_images"]
    validation_masks = video_state["masks"]
    validation_masks = [np.squeeze(mask) for mask in validation_masks]
    validation_masks = [(mask > 0).astype(np.uint8) * 255 for mask in validation_masks]
    validation_masks = [np.stack([m, m, m], axis=-1) for m in validation_masks]
    validation_images = [Image.fromarray(np.uint8(img)).convert('RGB') for img in validation_images]
    validation_masks = [Image.fromarray(np.uint8(mask)).convert('RGB') for mask in validation_masks]
    validation_images = [img.resize((720, 480)) for img in validation_images]
    validation_masks = [mask.resize((720, 480)) for mask in validation_masks]
    print("Inpainting: video_caption=", video_caption)
    images = generate_frames(
        images=validation_images,
        masks=validation_masks,
        pipe=validation_pipeline,
        pipe_img_inpainting=validation_pipeline_img,
        prompt=str(video_caption),
        image_inpainting_prompt=str(target_region_frame1_caption),
        seed=seed,
        cfg_scale=float(cfg_scale),
        dilate_size=int(dilate_size)
    )
    images = (images * 255).astype(np.uint8)
    video_output = generate_video_from_frames(images, output_path=f"{GRADIO_TEMP_DIR}/inpaint/{video_state['video_name']}", fps=8)
    print(f"Inpaint_video output: {video_output}")
    return video_output, update_status(previous_status, "Inpainting complete.", StatusMessage.SUCCESS)

def generate_video_from_frames(frames, output_path, fps=8):
    frames_tensor = torch.from_numpy(np.asarray(frames)).to(torch.uint8)
    if not os.path.exists(os.path.dirname(output_path)):
        os.makedirs(os.path.dirname(output_path))
    torchvision.io.write_video(output_path, frames_tensor, fps=fps, video_codec="libx264")
    return output_path
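# Note (added for clarity, not in the original upload): torchvision.io.write_video expects a
# uint8 tensor of shape [num_frames, height, width, channels], which is what the stacked RGB
# frames produced above provide. Illustrative call (hypothetical path):
#
#   generate_video_from_frames(np.zeros((49, 480, 720, 3), dtype=np.uint8), "./tmp_gradio/inpaint/demo.mp4")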

def process_example(video_input, video_caption, target_region_frame1_caption, prompt, click_state):
    if video_input is None or video_input == "":
        return (gr.update(value=""), gr.update(value=""), init_state(),
                {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
                "", None,
                gr.update(value=1, visible=False, interactive=False),
                gr.update(value=1, visible=False, interactive=False),
                gr.update(value="Positive", interactive=False),
                gr.update(visible=True, interactive=False),
                gr.update(visible=True, interactive=False),
                gr.update(value=None),
                gr.update(visible=True, interactive=False),
                create_status("Reset complete. Ready for new input.", StatusMessage.INFO),
                gr.update(value=None))
    video_state = gr.State({
        "user_name": "",
        "video_name": "",
        "origin_images": None,
        "painted_images": None,
        "masks": None,
        "inpaint_masks": None,
        "logits": None,
        "select_frame_number": 0,
        "fps": 8,
        "ann_obj_id": 0
    })
    results = get_frames_from_video(video_input, video_state)
    if click_state[0] and click_state[1]:
        print("Example detected, executing sam_refine")
        (video_caption, target_region_frame1_caption, inference_state, video_state, video_info, template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button, tracking_button, video_output, inpaint_button, run_status, video_input) = results
        class MockEvent:
            def __init__(self, points, point_idx=0):
                self.index = points[point_idx]
        for i_click in range(len(click_state[0])):
            evt = MockEvent(click_state[0], i_click)
            prompt_type = "Positive" if click_state[1][i_click] == 1 else "Negative"
            template_frame, video_state, interactive_state, run_status = sam_refine(inference_state, video_state, prompt_type, click_state, {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": None}, evt, run_status)
        return (video_caption, target_region_frame1_caption, inference_state, video_state, video_info, template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button, tracking_button, video_output, inpaint_button, run_status, video_input)
    return results

def reset_all():
    # gr.update() replaces the removed gr.Button.update()/gr.Number.update()/gr.Slider.update()
    # for Gradio 4.x compatibility.
    return (gr.update(value=None), gr.update(value=""), gr.update(value=""), init_state(),
            {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
            {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": None},
            [[], []], None, gr.update(visible=True, interactive=True), "",
            gr.update(value=1, visible=False, interactive=False), gr.update(value=1, visible=False, interactive=False),
            gr.update(value="Positive", interactive=False), gr.update(interactive=False),
            gr.update(interactive=False), gr.update(interactive=False),
            gr.update(interactive=False), gr.update(interactive=False),
            gr.update(interactive=False), gr.update(value=42),
            gr.update(value=6.0), gr.update(value=16),
            create_status("Reset complete. Ready for new input.", StatusMessage.INFO))

###############################
# Build Gradio Interface
###############################
title = """<p><h1 align="center">VideoPainter</h1></p>"""
with gr.Blocks() as iface:
    gr.HTML("""
    <div style="text-align: center;">
        <h1 style="color: #333;">🖌️ VideoPainter</h1>
        <h3 style="color: #333;">Any-length Video Inpainting and Editing with Plug-and-Play Context Control</h3>
        <p style="font-weight: bold;">
            <a href="https://yxbian23.github.io/project/video-painter/">🌍 Project Page</a> |
            <a href="https://arxiv.org/abs/2503.05639">📃 ArXiv Preprint</a> |
            <a href="https://github.com/TencentARC/VideoPainter">🧑‍💻 GitHub Repository</a>
        </p>
    </div>
    """)
    click_state = gr.State([[], []])
    interactive_state = gr.State({
        "inference_times": 0,
        "negative_click_times": 0,
        "positive_click_times": 0,
        "mask_save": False,
        "multi_mask": {"mask_names": [], "masks": []},
        "track_end_number": None,
    })
    video_state = gr.State({
        "user_name": "",
        "video_name": "",
        "origin_images": None,
        "painted_images": None,
        "masks": None,
        "inpaint_masks": None,
        "logits": None,
        "select_frame_number": 0,
        "fps": 8,
        "ann_obj_id": 0
    })
    inference_state = init_state()

    with gr.Row():
        with gr.Column():
            with gr.Row():
                video_input = gr.Video(label="Original Video", visible=True)
            with gr.Row():
                with gr.Column(scale=3):
                    template_frame = gr.Image(type="pil", interactive=True, elem_id="template_frame", visible=True)
                with gr.Column(scale=1):
                    with gr.Accordion("Segmentation Point Prompt", open=True):
                        point_prompt = gr.Radio(choices=["Positive", "Negative"], value="Positive", label="Point Type", interactive=False, visible=True)
                        clear_button_click = gr.Button(value="Clear clicks", interactive=False, visible=True)
                        gr.Markdown("✨ Positive: Include target region. <br> ✨ Negative: Exclude target region.")
                    image_selection_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track start frame", visible=False, interactive=False)
                    track_pause_number_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track end frame", visible=False, interactive=False)
            video_output = gr.Video(label="Generated Video", visible=True)
            with gr.Row():
                tracking_video_predict_button = gr.Button(value="Tracking", interactive=False, visible=True)
                inpaint_video_predict_button = gr.Button(value="Inpainting", interactive=False, visible=True)
                reset_button = gr.Button(value="Reset All", interactive=True, visible=True)

        with gr.Column():
            with gr.Accordion("Global Video Caption", open=True):
                video_caption = gr.Textbox(label="Global Video Caption", placeholder="Input global video caption...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
                with gr.Row():
                    gr.Markdown("✨ Enhance prompt using GPT-4o (optional).")
                    enhance_button = gr.Button("✨ Enhance Prompt (Optional)", interactive=False)
            with gr.Accordion("Target Object Caption", open=True):
                target_region_frame1_caption = gr.Textbox(label="Target Object Caption", placeholder="Input target object caption...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
                with gr.Row():
                    gr.Markdown("✨ Generate target caption (optional).")
                    enhance_target_region_frame1_button = gr.Button("✨ Target Prompt Generation (Optional)", interactive=False)
            with gr.Accordion("Editing Instruction", open=False):
                gr.Markdown("✨ Modify captions based on your instruction using GPT-4o.")
                with gr.Row():
                    editing_instruction = gr.Textbox(label="Editing Instruction", placeholder="Input editing instruction...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
                    enhance_editing_instruction_button = gr.Button("✨ Modify Caption (For Editing)", interactive=False)
            with gr.Accordion("Advanced Sampling Settings", open=False):
                cfg_scale = gr.Slider(value=6.0, label="Classifier-Free Guidance Scale", minimum=1, maximum=10, step=0.1, interactive=True)
                seed_param = gr.Number(label="Inference Seed (>=0)", interactive=True, value=42)
                dilate_size = gr.Slider(value=16, label="Mask Dilate Size", minimum=0, maximum=32, step=1, interactive=True)
            video_info = gr.Textbox(label="Video Info", visible=True, interactive=False)
            model_type = gr.Textbox(label="Type", placeholder="Model type...", interactive=True, visible=False)
            notes_accordion = gr.Accordion("Notes", open=False)
            with notes_accordion:
                gr.HTML("<p style='font-size: 1.1em;'>🧐 Reminder: VideoPainter may produce unexpected outputs. Adjust settings if needed.</p>")
            run_status = gr.HighlightedText(value=[("", "")], visible=True, label="Operation Status", show_label=True,
                                            color_map={"Success": "green", "Error": "red", "Warning": "orange", "Info": "blue"})

    with gr.Row():
        examples = gr.Examples(label="Quick Examples", examples=EXAMPLES,
                               inputs=[video_input, video_caption, target_region_frame1_caption, point_prompt, model_type, editing_instruction, seed_param, cfg_scale, dilate_size, click_state],
                               examples_per_page=20, cache_examples=False)

    video_input.change(fn=process_example, inputs=[video_input, video_caption, target_region_frame1_caption, point_prompt, click_state],
                       outputs=[video_caption, target_region_frame1_caption, inference_state, video_state, video_info,
                                template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click,
                                tracking_video_predict_button, video_output, inpaint_video_predict_button, run_status, video_input])

    image_selection_slider.release(fn=select_template, inputs=[image_selection_slider, video_state, interactive_state, run_status],
                                   outputs=[template_frame, video_state, interactive_state, run_status])

    track_pause_number_slider.release(fn=get_end_number, inputs=[track_pause_number_slider, video_state, interactive_state, run_status],
                                      outputs=[template_frame, interactive_state, run_status])

    template_frame.select(fn=sam_refine, inputs=[inference_state, video_state, point_prompt, click_state, interactive_state, run_status],
                          outputs=[template_frame, video_state, interactive_state, run_status])

    tracking_video_predict_button.click(fn=vos_tracking_video, inputs=[inference_state, video_state, interactive_state, run_status],
                                        outputs=[inference_state, video_output, video_state, interactive_state, run_status,
                                                 inpaint_video_predict_button, enhance_button, enhance_target_region_frame1_button, enhance_editing_instruction_button, notes_accordion])

    inpaint_video_predict_button.click(fn=inpaint_video, inputs=[video_state, video_caption, target_region_frame1_caption, interactive_state, run_status, seed_param, cfg_scale, dilate_size],
                                       outputs=[video_output, run_status], api_name=False, show_progress="full")

    def enhance_prompt_func(video_caption):
        return video_caption  # Replace with convert_prompt() if available

    def enhance_target_region_frame1_prompt_func(target_region_frame1_caption, video_state):
        return target_region_frame1_caption  # Replace with convert_prompt_target_region_frame1() if available

    def enhance_editing_instruction_prompt_func(editing_instruction, video_caption, target_region_frame1_caption, video_state):
        return video_caption, target_region_frame1_caption  # Replace with convert_prompt_editing_instruction() if available

    enhance_button.click(enhance_prompt_func, inputs=[video_caption], outputs=[video_caption])
    enhance_target_region_frame1_button.click(enhance_target_region_frame1_prompt_func, inputs=[target_region_frame1_caption, video_state], outputs=[target_region_frame1_caption])
    enhance_editing_instruction_button.click(enhance_editing_instruction_prompt_func, inputs=[editing_instruction, video_caption, target_region_frame1_caption, video_state],
                                             outputs=[video_caption, target_region_frame1_caption])

    # gr.update() replaces the removed gr.Button.update() for Gradio 4.x compatibility.
    video_input.clear(fn=lambda: (gr.update(visible=True), gr.update(visible=True), init_state(),
                                  {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
                                  {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": 0},
                                  [[], []], None, None,
                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True),
                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True, value=[]),
                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True),
                                  gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)),
                      outputs=[video_caption, target_region_frame1_caption, inference_state, video_state, interactive_state, click_state, video_output, template_frame, tracking_video_predict_button, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, template_frame, tracking_video_predict_button, video_output, inpaint_video_predict_button, run_status], queue=False, show_progress=False)

    clear_button_click.click(fn=clear_click, inputs=[inference_state, video_state, click_state, run_status],
                             outputs=[inference_state, template_frame, click_state, run_status])

    reset_button.click(fn=reset_all, inputs=[], outputs=[video_input, video_caption, target_region_frame1_caption, inference_state, video_state, interactive_state, click_state, video_output, template_frame, video_info, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, tracking_video_predict_button, inpaint_video_predict_button, enhance_button, enhance_target_region_frame1_button, enhance_editing_instruction_button, seed_param, cfg_scale, dilate_size, run_status])

iface.queue().launch(share=False)