reedmayhew committed (verified)
Commit 7d5d19b · 1 Parent(s): 7b309dd

Delete app.py

Files changed (1): app.py (+0 -702)
app.py DELETED
@@ -1,702 +0,0 @@
-#!/usr/bin/env python
-"""
-This is the full application script for VideoPainter.
-It first checks for and, if necessary, installs missing dependencies.
-When installing the custom packages (diffusers and app),
-it uses the flag --no-build-isolation so that the already-installed torch is visible during the build.
-If the custom diffusers package fails to provide the expected submodules,
-the script will force-install the official diffusers package.
-"""
-
-import os
-import sys
-import subprocess
-import warnings
-import time
-import json
-from collections import OrderedDict
-
-warnings.filterwarnings("ignore")
-
-###############################
-# Set up temporary directories
-###############################
-GRADIO_TEMP_DIR = "./tmp_gradio"
-os.makedirs(GRADIO_TEMP_DIR, exist_ok=True)
-os.makedirs(os.path.join(GRADIO_TEMP_DIR, "track"), exist_ok=True)
-os.makedirs(os.path.join(GRADIO_TEMP_DIR, "inpaint"), exist_ok=True)
-os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR
-
-###############################
-# Helper: Install package via pip
-###############################
-def install_package(package_spec):
-    print(f"Installing {package_spec} ...")
-    try:
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
-        print(f"Successfully installed {package_spec}")
-        return True
-    except Exception as e:
-        print(f"Failed to install {package_spec}: {e}")
-        return False
-
-###############################
-# Ensure PyTorch is present
-###############################
-print("Checking for PyTorch ...")
-try:
-    import torch
-    print("PyTorch is already installed.")
-except ImportError:
-    print("PyTorch not found, installing...")
-    if not install_package("torch>=2.0.0 torchvision>=0.15.0"):
-        print("Failed to install PyTorch, which is required.")
-        sys.exit(1)
-
-###############################
-# Check/install critical dependencies
-###############################
-critical_dependencies = [
-    ("hydra", "hydra-core>=1.3.2"),
-    ("omegaconf", "omegaconf>=2.3.0"),
-    ("decord", "decord>=0.6.0"),
-    ("diffusers", "diffusers>=0.24.0"),  # This one is later replaced by our custom version.
-    ("transformers", "transformers>=4.35.0"),
-    ("gradio", "gradio>=4.0.0"),
-    ("numpy", "numpy>=1.24.0"),
-    ("cv2", "opencv-python>=4.8.0"),
-    ("PIL", "Pillow>=10.0.0"),
-    ("scipy", "scipy>=1.11.0"),
-    ("einops", "einops>=0.7.0"),
-    ("onnxruntime", "onnxruntime>=1.16.0"),
-    ("timm", "timm>=0.9.0"),
-    ("safetensors", "safetensors>=0.4.0"),
-    ("moviepy", "moviepy>=1.0.3"),
-    ("imageio", "imageio>=2.30.0"),
-    ("tqdm", "tqdm>=4.64.0"),
-    ("openai", "openai>=1.5.0"),
-    ("psutil", "psutil>=5.9.0")
-]
-
-for mod_name, pkg_spec in critical_dependencies:
-    try:
-        if mod_name == "PIL":
-            from PIL import Image
-        elif mod_name == "cv2":
-            import cv2
-        else:
-            __import__(mod_name)
-        print(f"{mod_name} is already installed.")
-    except ImportError:
-        print(f"{mod_name} not found, installing {pkg_spec} ...")
-        install_package(pkg_spec)
-
-###############################
-# Environment setup: Clone repository, install custom packages
-###############################
-print("Setting up environment...")
-
-# Clone the VideoPainter repository if not present
-if not os.path.exists("VideoPainter"):
-    print("Cloning VideoPainter repository...")
-    os.system("git clone https://github.com/TencentARC/VideoPainter.git")
-
-# Append repository folders to sys.path (if not already)
-sys.path.append(os.path.join(os.getcwd(), "VideoPainter"))
-sys.path.append(os.path.join(os.getcwd(), "VideoPainter/app"))
-sys.path.append(os.path.join(os.getcwd(), "app"))
-sys.path.append(".")
-
-# Install the custom diffusers package from VideoPainter/diffusers.
-if os.path.exists("VideoPainter/diffusers"):
-    print("Installing custom diffusers (editable, no-build-isolation)...")
-    os.system("pip install --no-build-isolation -e VideoPainter/diffusers")
-
-# Copy VideoPainter/app to local 'app' directory if needed.
-if not os.path.exists("app"):
-    os.makedirs("app", exist_ok=True)
-    print("Copying VideoPainter/app to local app directory...")
-    os.system("cp -r VideoPainter/app/* app/")
-
-# Install the app package in editable mode.
-if os.path.exists("app"):
-    curr_dir = os.getcwd()
-    os.chdir("app")
-    print("Installing app package (editable, no-build-isolation)...")
-    ret = os.system("pip install --no-build-isolation -e .")
-    if ret != 0:
-        print("Warning: Installing the app package failed; continuing by adding 'app' to sys.path.")
-    os.chdir(curr_dir)
-
-###############################
-# Import modules – if any critical module is missing, exit.
-###############################
-try:
-    print("Importing modules...")
-    import gradio as gr
-    import cv2
-    import numpy as np
-    import scipy
-    import torchvision
-    from PIL import Image
-    from huggingface_hub import snapshot_download
-    from decord import VideoReader
-    from sam2.build_sam import build_sam2_video_predictor
-    from utils import load_model, generate_frames
-    print("Standard and specialized modules imported successfully!")
-except ImportError as e:
-    print(f"Error importing modules: {e}")
-    sys.exit(1)
-
-###############################
-# Validate diffusers installation.
-###############################
-try:
-    from diffusers import pipelines  # Expect this to work.
-    print("Custom diffusers installation appears complete.")
-except Exception as e:
-    print("Custom diffusers installation appears broken:")
-    print(e)
-    print("Installing official diffusers package from PyPI (>=0.24.0)...")
-    if install_package("diffusers>=0.24.0 --force-reinstall"):
-        try:
-            from diffusers import pipelines
-            print("Official diffusers package installed successfully.")
-        except Exception as e2:
-            print("Failed to import diffusers even after installing official version.")
-            sys.exit(1)
-    else:
-        sys.exit(1)
-
-###############################
-# Begin Application Code (VideoPainter demo)
-###############################
-
-def download_models():
-    print("Downloading models from Hugging Face Hub...")
-    models = {
-        "CogVideoX-5b-I2V": "THUDM/CogVideoX-5b-I2V",
-        "VideoPainter": "TencentARC/VideoPainter"
-    }
-    model_paths = {}
-    os.makedirs("ckpt", exist_ok=True)
-    for name, repo_id in models.items():
-        print(f"Downloading {name} from {repo_id}...")
-        path = snapshot_download(repo_id=repo_id)
-        model_paths[name] = path
-        print(f"Downloaded {name} to {path}")
-    try:
-        flux_path = snapshot_download(repo_id="black-forest-labs/FLUX.1-Fill-dev")
-        model_paths["FLUX"] = flux_path
-    except Exception as e:
-        print(f"Failed to download FLUX model: {e}")
-        model_paths["FLUX"] = None
-    os.makedirs("ckpt/Grounded-SAM-2", exist_ok=True)
-    sam2_path = "ckpt/Grounded-SAM-2/sam2_hiera_large.pt"
-    if not os.path.exists(sam2_path):
-        print(f"Downloading SAM2 to {sam2_path}...")
-        os.system(f"wget -O {sam2_path} https://huggingface.co/spaces/sam2/sam2/resolve/main/sam2_hiera_large.pt")
-    model_paths["SAM2"] = sam2_path
-    return model_paths
-
-print("Initializing application environment...")
-if not os.path.exists("app"):
-    print("Setting up app folder from VideoPainter repository ...")
-    os.system("git clone https://github.com/TencentARC/VideoPainter.git")
-    os.makedirs("app", exist_ok=True)
-    os.system("cp -r VideoPainter/app/* app/")
-    os.system("pip install --no-build-isolation -e VideoPainter/diffusers")
-    os.chdir("app")
-    os.system("pip install --no-build-isolation -e .")
-    os.chdir("..")
-
-sys.path.append("app")
-sys.path.append(".")
-
-# Import project modules (again, to be safe)
-try:
-    from decord import VideoReader
-    from sam2.build_sam import build_sam2_video_predictor
-    from utils import load_model, generate_frames
-except ImportError as e:
-    print(f"Failed to import specialized modules: {e}")
-    sys.exit(1)
-
-# Set up OpenRouter / OpenAI (for caption generation)
-try:
-    from openai import OpenAI
-    vlm_model = OpenAI(
-        api_key=os.getenv("OPENROUTER_API_KEY", ""),
-        base_url="https://openrouter.ai/api/v1"
-    )
-    print("OpenRouter client initialized successfully")
-except Exception as e:
-    print(f"OpenRouter API not available: {e}")
-    class DummyModel:
-        def __getattr__(self, name):
-            return self
-        def __call__(self, *args, **kwargs):
-            return self
-        def create(self, *args, **kwargs):
-            class DummyResponse:
-                choices = [type('obj', (object,), {'message': type('obj', (object,), {'content': "OpenRouter API not available. Using default prompt."})})]
-            return DummyResponse()
-    vlm_model = DummyModel()
-
-###############################
-# Download models and initialize predictors
-###############################
-model_paths = download_models()
-base_model_path = model_paths["CogVideoX-5b-I2V"]
-videopainter_path = model_paths["VideoPainter"]
-inpainting_branch = os.path.join(videopainter_path, "checkpoints/branch")
-id_adapter = os.path.join(videopainter_path, "VideoPainterID/checkpoints")
-img_inpainting_model = model_paths.get("FLUX")
-sam2_checkpoint = "ckpt/Grounded-SAM-2/sam2_hiera_large.pt"
-model_cfg = "sam2_hiera_l.yaml"
-
-try:
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)
-    print("Build SAM2 predictor done!")
-    validation_pipeline, validation_pipeline_img = load_model(
-        model_path=base_model_path,
-        inpainting_branch=inpainting_branch,
-        id_adapter=id_adapter,
-        img_inpainting_model=img_inpainting_model
-    )
-    print("Load model done!")
-except Exception as e:
-    print(f"Error initializing models: {e}")
-    sys.exit(1)
-
-###############################
-# Helper functions & state definitions
-###############################
-EXAMPLES = [
-    [
-        "https://huggingface.co/spaces/TencentARC/VideoPainter/resolve/main/examples/ferry.mp4",
-        "A white ferry with red and blue accents, named 'COLONIA', cruises on a calm river...",
-        "White and red passenger ferry boat labeled 'COLONIA 6' with multiple windows, life buoys, and upper deck seating.",
-        "Positive",
-        "Inpaint",
-        "",
-        42,
-        6.0,
-        16,
-        [[[320, 240]], [1]],
-    ],
-    [
-        "https://huggingface.co/spaces/TencentARC/VideoPainter/resolve/main/examples/street.mp4",
-        "A bustling city street at night illuminated by festive lights, a red double-decker bus...",
-        "The rear of a black car with illuminated red tail lights and a visible license plate.",
-        "Positive",
-        "Inpaint",
-        "",
-        42,
-        6.0,
-        16,
-        [[[200, 400]], [1]],
-    ],
-]
-
-class StatusMessage:
-    INFO = "Info"
-    WARNING = "Warning"
-    ERROR = "Error"
-    SUCCESS = "Success"
-
-def create_status(message, status_type=StatusMessage.INFO):
-    timestamp = time.strftime("%H:%M:%S")
-    return [("", ""), (f"[{timestamp}]: {message}\n", status_type)]
-
-def update_status(previous_status, new_message, status_type=StatusMessage.INFO):
-    timestamp = time.strftime("%H:%M:%S")
-    history = previous_status[-3:]
-    history.append((f"[{timestamp}]: {new_message}\n", status_type))
-    return [("", "")] + history
-
-def init_state(offload_video_to_cpu=False, offload_state_to_cpu=False):
-    inference_state = {}
-    inference_state["images"] = torch.zeros([1, 3, 100, 100])
-    inference_state["num_frames"] = 1
-    inference_state["offload_video_to_cpu"] = offload_video_to_cpu
-    inference_state["offload_state_to_cpu"] = offload_state_to_cpu
-    inference_state["video_height"] = 100
-    inference_state["video_width"] = 100
-    inference_state["device"] = torch.device("cuda")
-    inference_state["storage_device"] = torch.device("cpu") if offload_state_to_cpu else torch.device("cuda")
-    inference_state["point_inputs_per_obj"] = {}
-    inference_state["mask_inputs_per_obj"] = {}
-    inference_state["cached_features"] = {}
-    inference_state["constants"] = {}
-    inference_state["obj_id_to_idx"] = OrderedDict()
-    inference_state["obj_idx_to_id"] = OrderedDict()
-    inference_state["obj_ids"] = []
-    inference_state["output_dict"] = {"cond_frame_outputs": {}, "non_cond_frame_outputs": {}}
-    inference_state["output_dict_per_obj"] = {}
-    inference_state["temp_output_dict_per_obj"] = {}
-    inference_state["consolidated_frame_inds"] = {"cond_frame_outputs": set(), "non_cond_frame_outputs": set()}
-    inference_state["tracking_has_started"] = False
-    inference_state["frames_already_tracked"] = {}
-    inference_state = gr.State(inference_state)
-    return inference_state
-
-# (All additional helper functions such as get_frames_from_video, sam_refine, vos_tracking_video,
-# inpaint_video, generate_video_from_frames, process_example, reset_all, etc. are defined below.)
-# For brevity, they are included here in full as in your original code.
-
-def get_frames_from_video(video_input, video_state):
-    video_path = video_input
-    frames = []
-    user_name = time.time()
-    vr = VideoReader(video_path)
-    original_fps = vr.get_avg_fps()
-    if original_fps > 8:
-        total_frames = len(vr)
-        sample_interval = max(1, int(original_fps / 8))
-        frame_indices = list(range(0, total_frames, sample_interval))
-        frames = vr.get_batch(frame_indices).asnumpy()
-    else:
-        frames = vr.get_batch(list(range(len(vr)))).asnumpy()
-    frames = frames[:49]
-    resized_frames = [cv2.resize(frame, (720, 480)) for frame in frames]
-    frames = np.array(resized_frames)
-    init_start = time.time()
-    inference_state = predictor.init_state(images=frames, offload_video_to_cpu=True, async_loading_frames=True)
-    init_time = time.time() - init_start
-    print(f"Inference state initialization took {init_time:.2f}s")
-    fps = 8
-    image_size = (frames[0].shape[0], frames[0].shape[1])
-    video_state = {
-        "user_name": user_name,
-        "video_name": os.path.split(video_path)[-1],
-        "origin_images": frames,
-        "painted_images": frames.copy(),
-        "masks": [np.zeros((frames[0].shape[0], frames[0].shape[1]), np.uint8)] * len(frames),
-        "logits": [None] * len(frames),
-        "select_frame_number": 0,
-        "fps": fps,
-        "ann_obj_id": 0
-    }
-    video_info = f"Video Name: {video_state['video_name']}, FPS: {video_state['fps']}, Total Frames: {len(frames)}, Image Size: {image_size}"
-    video_input_path = generate_video_from_frames(frames, output_path=f"{GRADIO_TEMP_DIR}/inpaint/original_{video_state['video_name']}", fps=fps)
-    return (gr.update(visible=True), gr.update(visible=True), inference_state, video_state, video_info,
-            video_state["origin_images"][0], gr.update(visible=False, maximum=len(frames), value=1, interactive=True),
-            gr.update(visible=False, maximum=len(frames), value=len(frames), interactive=True), gr.update(visible=True, interactive=True),
-            gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True), gr.update(visible=True),
-            gr.update(visible=True, interactive=False), create_status("Upload video complete. Ready to select targets.", StatusMessage.SUCCESS), video_input_path)
-
-def select_template(image_selection_slider, video_state, interactive_state, previous_status):
-    image_selection_slider -= 1
-    video_state["select_frame_number"] = image_selection_slider
-    return video_state["painted_images"][image_selection_slider], video_state, interactive_state, update_status(previous_status, f"Set tracking start at frame {image_selection_slider}.", StatusMessage.INFO)
-
-def get_end_number(track_pause_number_slider, video_state, interactive_state, previous_status):
-    interactive_state["track_end_number"] = track_pause_number_slider
-    return video_state["painted_images"][track_pause_number_slider], interactive_state, update_status(previous_status, f"Set tracking finish at frame {track_pause_number_slider}.", StatusMessage.INFO)
-
-def sam_refine(inference_state, video_state, point_prompt, click_state, interactive_state, evt, previous_status):
-    ann_obj_id = 0
-    ann_frame_idx = video_state["select_frame_number"]
-    if point_prompt == "Positive":
-        coordinate = f"[[{evt.index[0]},{evt.index[1]},1]]"
-        interactive_state["positive_click_times"] += 1
-    else:
-        coordinate = f"[[{evt.index[0]},{evt.index[1]},0]]"
-        interactive_state["negative_click_times"] += 1
-    print(f"sam_refine, point_prompt: {point_prompt}, click_state: {click_state}")
-    prompt = {"prompt_type":["click"], "input_point": click_state[0], "input_label": click_state[1], "multimask_output": "True"}
-    points = np.array(prompt["input_point"])
-    labels = np.array(prompt["input_label"])
-    height, width = video_state["origin_images"][0].shape[0:2]
-    for i in range(len(points)):
-        points[i, 0] = int(points[i, 0])
-        points[i, 1] = int(points[i, 1])
-    print(f"sam_refine points: {points}, labels: {labels}")
-    frame_idx, obj_ids, mask = predictor.add_new_points(inference_state=inference_state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, points=points, labels=labels)
-    mask_ = mask.cpu().squeeze().detach().numpy()
-    mask_[mask_ <= 0] = 0
-    mask_[mask_ > 0] = 1
-    org_image = video_state["origin_images"][video_state["select_frame_number"]]
-    mask_ = cv2.resize(mask_, (width, height))
-    mask_ = mask_[:, :, None]
-    mask_[mask_ > 0.5] = 1
-    mask_[mask_ <= 0.5] = 0
-    color = 63 * np.ones((height, width, 3)) * np.array([[[np.random.randint(5), np.random.randint(5), np.random.randint(5)]]])
-    painted_image = np.uint8((1 - 0.5 * mask_) * org_image + 0.5 * mask_ * color)
-    video_state["masks"][video_state["select_frame_number"]] = mask_
-    video_state["painted_images"][video_state["select_frame_number"]] = painted_image
-    return painted_image, video_state, interactive_state, update_status(previous_status, "Segmentation updated. Add more points or continue tracking.", StatusMessage.SUCCESS)
-
-def clear_click(inference_state, video_state, click_state, previous_status):
-    predictor.reset_state(inference_state)
-    click_state = [[], []]
-    template_frame = video_state["origin_images"][video_state["select_frame_number"]]
-    return inference_state, template_frame, click_state, update_status(previous_status, "Click history cleared.", StatusMessage.INFO)
-
-def vos_tracking_video(inference_state, video_state, interactive_state, previous_status):
-    height, width = video_state["origin_images"][0].shape[0:2]
-    masks = []
-    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
-        mask = np.zeros([480, 720, 1])
-        for i in range(len(out_mask_logits)):
-            out_mask = out_mask_logits[i].cpu().squeeze().detach().numpy()
-            out_mask[out_mask > 0] = 1
-            out_mask[out_mask <= 0] = 0
-            out_mask = out_mask[:, :, None]
-            mask += out_mask
-        mask = cv2.resize(mask, (width, height))
-        mask = mask[:, :, None]
-        mask[mask > 0.5] = 1
-        mask[mask < 1] = 0
-        mask = scipy.ndimage.binary_dilation(mask, iterations=6)
-        masks.append(mask)
-    masks = np.array(masks)
-    if interactive_state.get("track_end_number") is not None:
-        video_state["masks"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = masks
-        org_images = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
-        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
-        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
-        video_state["painted_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = painted_images
-    else:
-        video_state["masks"] = masks
-        org_images = video_state["origin_images"]
-        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
-        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
-        video_state["painted_images"] = painted_images
-    video_output = generate_video_from_frames(video_state["painted_images"], output_path=f"{GRADIO_TEMP_DIR}/track/{video_state['video_name']}", fps=video_state["fps"])
-    interactive_state["inference_times"] += 1
-    print(f"vos_tracking_video output: {video_output}")
-    return inference_state, video_output, video_state, interactive_state, update_status(previous_status, "Tracking complete.", StatusMessage.SUCCESS), gr.Button.update(interactive=True), gr.Button.update(interactive=True), gr.Button.update(interactive=True), gr.Button.update(interactive=True)
-
-def inpaint_video(video_state, video_caption, target_region_frame1_caption, interactive_state, previous_status, seed_param, cfg_scale, dilate_size):
-    seed = int(seed_param) if int(seed_param) >= 0 else np.random.randint(0, 2**32 - 1)
-    validation_images = video_state["origin_images"]
-    validation_masks = video_state["masks"]
-    validation_masks = [np.squeeze(mask) for mask in validation_masks]
-    validation_masks = [(mask > 0).astype(np.uint8) * 255 for mask in validation_masks]
-    validation_masks = [np.stack([m, m, m], axis=-1) for m in validation_masks]
-    validation_images = [Image.fromarray(np.uint8(img)).convert('RGB') for img in validation_images]
-    validation_masks = [Image.fromarray(np.uint8(mask)).convert('RGB') for mask in validation_masks]
-    validation_images = [img.resize((720, 480)) for img in validation_images]
-    validation_masks = [mask.resize((720, 480)) for mask in validation_masks]
-    print("Inpainting: video_caption=", video_caption)
-    images = generate_frames(
-        images=validation_images,
-        masks=validation_masks,
-        pipe=validation_pipeline,
-        pipe_img_inpainting=validation_pipeline_img,
-        prompt=str(video_caption),
-        image_inpainting_prompt=str(target_region_frame1_caption),
-        seed=seed,
-        cfg_scale=float(cfg_scale),
-        dilate_size=int(dilate_size)
-    )
-    images = (images * 255).astype(np.uint8)
-    video_output = generate_video_from_frames(images, output_path=f"{GRADIO_TEMP_DIR}/inpaint/{video_state['video_name']}", fps=8)
-    print(f"Inpaint_video output: {video_output}")
-    return video_output, update_status(previous_status, "Inpainting complete.", StatusMessage.SUCCESS)
-
-def generate_video_from_frames(frames, output_path, fps=8):
-    frames_tensor = torch.from_numpy(np.asarray(frames)).to(torch.uint8)
-    if not os.path.exists(os.path.dirname(output_path)):
-        os.makedirs(os.path.dirname(output_path))
-    torchvision.io.write_video(output_path, frames_tensor, fps=fps, video_codec="libx264")
-    return output_path
-
-def process_example(video_input, video_caption, target_region_frame1_caption, prompt, click_state):
-    if video_input is None or video_input == "":
-        return (gr.update(value=""), gr.update(value=""), init_state(),
-                {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
-                "", None,
-                gr.update(value=1, visible=False, interactive=False),
-                gr.update(value=1, visible=False, interactive=False),
-                gr.update(value="Positive", interactive=False),
-                gr.update(visible=True, interactive=False),
-                gr.update(visible=True, interactive=False),
-                gr.update(value=None),
-                gr.update(visible=True, interactive=False),
-                create_status("Reset complete. Ready for new input.", StatusMessage.INFO),
-                gr.update(value=None))
-    video_state = gr.State({
-        "user_name": "",
-        "video_name": "",
-        "origin_images": None,
-        "painted_images": None,
-        "masks": None,
-        "inpaint_masks": None,
-        "logits": None,
-        "select_frame_number": 0,
-        "fps": 8,
-        "ann_obj_id": 0
-    })
-    results = get_frames_from_video(video_input, video_state)
-    if click_state[0] and click_state[1]:
-        print("Example detected, executing sam_refine")
-        (video_caption, target_region_frame1_caption, inference_state, video_state, video_info, template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button, tracking_button, video_output, inpaint_button, run_status, video_input) = results
-        class MockEvent:
-            def __init__(self, points, point_idx=0):
-                self.index = points[point_idx]
-        for i_click in range(len(click_state[0])):
-            evt = MockEvent(click_state[0], i_click)
-            prompt_type = "Positive" if click_state[1][i_click] == 1 else "Negative"
-            template_frame, video_state, interactive_state, run_status = sam_refine(inference_state, video_state, prompt_type, click_state, {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": None}, evt, run_status)
-        return (video_caption, target_region_frame1_caption, inference_state, video_state, video_info, template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button, tracking_button, video_output, inpaint_button, run_status, video_input)
-    return results
-
-def reset_all():
-    return (gr.update(value=None), gr.update(value=""), gr.update(value=""), init_state(),
-            {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
-            {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": None},
-            [[], []], None, gr.update(visible=True, interactive=True), "",
-            gr.update(value=1, visible=False, interactive=False), gr.update(value=1, visible=False, interactive=False),
-            gr.update(value="Positive", interactive=False), gr.Button.update(interactive=False),
-            gr.Button.update(interactive=False), gr.Button.update(interactive=False),
-            gr.Button.update(interactive=False), gr.Button.update(interactive=False),
-            gr.Button.update(interactive=False), gr.Number.update(value=42),
-            gr.Slider.update(value=6.0), gr.Slider.update(value=16),
-            create_status("Reset complete. Ready for new input.", StatusMessage.INFO))
-
-###############################
-# Build Gradio Interface
-###############################
-title = """<p><h1 align="center">VideoPainter</h1></p>"""
-with gr.Blocks() as iface:
-    gr.HTML("""
-    <div style="text-align: center;">
-        <h1 style="color: #333;">🖌️ VideoPainter</h1>
-        <h3 style="color: #333;">Any-length Video Inpainting and Editing with Plug-and-Play Context Control</h3>
-        <p style="font-weight: bold;">
-            <a href="https://yxbian23.github.io/project/video-painter/">🌍 Project Page</a> |
-            <a href="https://arxiv.org/abs/2503.05639">📃 ArXiv Preprint</a> |
-            <a href="https://github.com/TencentARC/VideoPainter">🧑‍💻 Github Repository</a>
-        </p>
-    </div>
-    """)
-    click_state = gr.State([[], []])
-    interactive_state = gr.State({
-        "inference_times": 0,
-        "negative_click_times": 0,
-        "positive_click_times": 0,
-        "mask_save": False,
-        "multi_mask": {"mask_names": [], "masks": []},
-        "track_end_number": None,
-    })
-    video_state = gr.State({
-        "user_name": "",
-        "video_name": "",
-        "origin_images": None,
-        "painted_images": None,
-        "masks": None,
-        "inpaint_masks": None,
-        "logits": None,
-        "select_frame_number": 0,
-        "fps": 8,
-        "ann_obj_id": 0
-    })
-    inference_state = init_state()
-
-    with gr.Row():
-        with gr.Column():
-            with gr.Row():
-                video_input = gr.Video(label="Original Video", visible=True)
-            with gr.Row():
-                with gr.Column(scale=3):
-                    template_frame = gr.Image(type="pil", interactive=True, elem_id="template_frame", visible=True)
-                with gr.Column(scale=1):
-                    with gr.Accordion("Segmentation Point Prompt", open=True):
-                        point_prompt = gr.Radio(choices=["Positive", "Negative"], value="Positive", label="Point Type", interactive=False, visible=True)
-                        clear_button_click = gr.Button(value="Clear clicks", interactive=False, visible=True)
-                        gr.Markdown("✨ Positive: Include target region. <br> ✨ Negative: Exclude target region.")
-                    image_selection_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track start frame", visible=False, interactive=False)
-                    track_pause_number_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track end frame", visible=False, interactive=False)
-            video_output = gr.Video(label="Generated Video", visible=True)
-            with gr.Row():
-                tracking_video_predict_button = gr.Button(value="Tracking", interactive=False, visible=True)
-                inpaint_video_predict_button = gr.Button(value="Inpainting", interactive=False, visible=True)
-                reset_button = gr.Button(value="Reset All", interactive=True, visible=True)
-
-        with gr.Column():
-            with gr.Accordion("Global Video Caption", open=True):
-                video_caption = gr.Textbox(label="Global Video Caption", placeholder="Input global video caption...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
-                with gr.Row():
-                    gr.Markdown("✨ Enhance prompt using GPT-4o (optional).")
-                    enhance_button = gr.Button("✨ Enhance Prompt(Optional)", interactive=False)
-            with gr.Accordion("Target Object Caption", open=True):
-                target_region_frame1_caption = gr.Textbox(label="Target Object Caption", placeholder="Input target object caption...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
-                with gr.Row():
-                    gr.Markdown("✨ Generate target caption (optional).")
-                    enhance_target_region_frame1_button = gr.Button("✨ Target Prompt Generation (Optional)", interactive=False)
-            with gr.Accordion("Editing Instruction", open=False):
-                gr.Markdown("✨ Modify captions based on your instruction using GPT-4o.")
-                with gr.Row():
-                    editing_instruction = gr.Textbox(label="Editing Instruction", placeholder="Input editing instruction...", interactive=True, visible=True, max_lines=5, show_copy_button=True)
-                    enhance_editing_instruction_button = gr.Button("✨ Modify Caption(For Editing)", interactive=False)
-            with gr.Accordion("Advanced Sampling Settings", open=False):
-                cfg_scale = gr.Slider(value=6.0, label="Classifier-Free Guidance Scale", minimum=1, maximum=10, step=0.1, interactive=True)
-                seed_param = gr.Number(label="Inference Seed (>=0)", interactive=True, value=42)
-                dilate_size = gr.Slider(value=16, label="Mask Dilate Size", minimum=0, maximum=32, step=1, interactive=True)
-            video_info = gr.Textbox(label="Video Info", visible=True, interactive=False)
-            model_type = gr.Textbox(label="Type", placeholder="Model type...", interactive=True, visible=False)
-            notes_accordion = gr.Accordion("Notes", open=False)
-            with notes_accordion:
-                gr.HTML("<p style='font-size: 1.1em;'>🧐 Reminder: VideoPainter may produce unexpected outputs. Adjust settings if needed.</p>")
-            run_status = gr.HighlightedText(value=[("", "")], visible=True, label="Operation Status", show_label=True,
-                                            color_map={"Success": "green", "Error": "red", "Warning": "orange", "Info": "blue"})
-
-    with gr.Row():
-        examples = gr.Examples(label="Quick Examples", examples=EXAMPLES,
-                               inputs=[video_input, video_caption, target_region_frame1_caption, point_prompt, model_type, editing_instruction, seed_param, cfg_scale, dilate_size, click_state],
-                               examples_per_page=20, cache_examples=False)
-
-    video_input.change(fn=process_example, inputs=[video_input, video_caption, target_region_frame1_caption, point_prompt, click_state],
-                       outputs=[video_caption, target_region_frame1_caption, inference_state, video_state, video_info,
-                                template_frame, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click,
-                                tracking_video_predict_button, video_output, inpaint_video_predict_button, run_status, video_input])
-
-    image_selection_slider.release(fn=select_template, inputs=[image_selection_slider, video_state, interactive_state, run_status],
-                                   outputs=[template_frame, video_state, interactive_state, run_status])
-
-    track_pause_number_slider.release(fn=get_end_number, inputs=[track_pause_number_slider, video_state, interactive_state, run_status],
-                                      outputs=[template_frame, interactive_state, run_status])
-
-    template_frame.select(fn=sam_refine, inputs=[inference_state, video_state, point_prompt, click_state, interactive_state, run_status],
-                          outputs=[template_frame, video_state, interactive_state, run_status])
-
-    tracking_video_predict_button.click(fn=vos_tracking_video, inputs=[inference_state, video_state, interactive_state, run_status],
-                                        outputs=[inference_state, video_output, video_state, interactive_state, run_status,
-                                                 inpaint_video_predict_button, enhance_button, enhance_target_region_frame1_button, enhance_editing_instruction_button, notes_accordion])
-
-    inpaint_video_predict_button.click(fn=inpaint_video, inputs=[video_state, video_caption, target_region_frame1_caption, interactive_state, run_status, seed_param, cfg_scale, dilate_size],
-                                       outputs=[video_output, run_status], api_name=False, show_progress="full")
-
-    def enhance_prompt_func(video_caption):
-        return video_caption  # Replace with your convert_prompt() if available
-
-    def enhance_target_region_frame1_prompt_func(target_region_frame1_caption, video_state):
-        return target_region_frame1_caption  # Replace with your convert_prompt_target_region_frame1() if available
-
-    def enhance_editing_instruction_prompt_func(editing_instruction, video_caption, target_region_frame1_caption, video_state):
-        return video_caption, target_region_frame1_caption  # Replace with your convert_prompt_editing_instruction() if available
-
-    enhance_button.click(enhance_prompt_func, inputs=[video_caption], outputs=[video_caption])
-    enhance_target_region_frame1_button.click(enhance_target_region_frame1_prompt_func, inputs=[target_region_frame1_caption, video_state], outputs=[target_region_frame1_caption])
-    enhance_editing_instruction_button.click(enhance_editing_instruction_prompt_func, inputs=[editing_instruction, video_caption, target_region_frame1_caption, video_state],
-                                             outputs=[video_caption, target_region_frame1_caption])
-
-    video_input.clear(fn=lambda: (gr.update(visible=True), gr.update(visible=True), init_state(),
-                                  {"user_name": "", "video_name": "", "origin_images": None, "painted_images": None, "masks": None, "inpaint_masks": None, "logits": None, "select_frame_number": 0, "fps": 8, "ann_obj_id": 0},
-                                  {"inference_times": 0, "negative_click_times": 0, "positive_click_times": 0, "mask_save": False, "multi_mask": {"mask_names": [], "masks": []}, "track_end_number": 0},
-                                  [[], []], None, None,
-                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True),
-                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True, value=[]),
-                                  gr.update(visible=True), gr.update(visible=True), gr.update(visible=True),
-                                  gr.Button.update(interactive=False), gr.Button.update(interactive=False), gr.Button.update(interactive=False)),
-                      outputs=[video_caption, target_region_frame1_caption, inference_state, video_state, interactive_state, click_state, video_output, template_frame, tracking_video_predict_button, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, template_frame, tracking_video_predict_button, video_output, inpaint_video_predict_button, run_status], queue=False, show_progress=False)
-
-    clear_button_click.click(fn=clear_click, inputs=[inference_state, video_state, click_state, run_status],
-                             outputs=[inference_state, template_frame, click_state, run_status])
-
-    reset_button.click(fn=reset_all, inputs=[], outputs=[video_input, video_caption, target_region_frame1_caption, inference_state, video_state, interactive_state, click_state, video_output, template_frame, video_info, image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, tracking_video_predict_button, inpaint_video_predict_button, enhance_button, enhance_target_region_frame1_button, enhance_editing_instruction_button, seed_param, cfg_scale, dilate_size, run_status])
-
-iface.queue().launch(share=False)
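
Note on the installer helper in the deleted script: `install_package` forwards its whole argument to pip as a single token, so calls such as `install_package("torch>=2.0.0 torchvision>=0.15.0")` and `install_package("diffusers>=0.24.0 --force-reinstall")` hand pip one combined requirement string rather than separate requirements and flags. A minimal sketch of a variant that passes each specifier as its own argument (illustrative only; this helper is not part of the deleted file):

    import subprocess
    import sys

    def install_packages(*specs):
        # Pass each requirement or flag as its own argv entry so pip parses them individually.
        # (Hypothetical replacement for the single-string install_package in the deleted app.py.)
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", *specs])
            return True
        except subprocess.CalledProcessError as exc:
            print(f"pip install failed: {exc}")
            return False

    # Usage mirroring the calls in the deleted script:
    # install_packages("torch>=2.0.0", "torchvision>=0.15.0")
    # install_packages("--force-reinstall", "diffusers>=0.24.0")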