smile123456789 committed
Commit b65930c · Parent(s): c7b92cf

reorganize code
- .gitattributes +2 -0
- app.py +27 -94
- asserts/example_images/4.png +0 -0
- models/local_facial_extractor.py +75 -35
- models/pipeline_consisid.py +65 -36
- models/transformer_consisid.py +70 -35
- models/utils.py +102 -12
- requirements.txt +1 -1
- util/dataloader.py +0 -1010
- util/deepspeed_configs/accelerate_config_machine_multi.yaml +0 -18
- util/deepspeed_configs/accelerate_config_machine_single.yaml +0 -13
- util/deepspeed_configs/hostfile.txt +0 -2
- util/deepspeed_configs/zero_stage2_config.json +0 -17
.gitattributes
CHANGED
@@ -1,3 +1,5 @@
+__pycache__/
+
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -1,37 +1,26 @@
 import os
 import math
 import time
-import numpy
 import spaces
 import random
 import threading
 import gradio as gr
-from PIL import Image, ImageOps
 from moviepy import VideoFileClip
 from datetime import datetime, timedelta
 from huggingface_hub import hf_hub_download, snapshot_download
 
-import insightface
-from insightface.app import FaceAnalysis
-from facexlib.parsing import init_parsing_model
-from facexlib.utils.face_restoration_helper import FaceRestoreHelper
-
 import torch
-from diffusers import CogVideoXDPMScheduler
-from diffusers.utils import load_image
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.training_utils import free_memory
 
 from util.utils import *
 from util.rife_model import load_rife_model, rife_inference_with_latents
-from models.utils import
+from models.utils import process_face_embeddings_infer, prepare_face_models
 from models.transformer_consisid import ConsisIDTransformer3DModel
 from models.pipeline_consisid import ConsisIDPipeline
-from models.eva_clip import create_model_and_transforms
-from models.eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
-from models.eva_clip.utils_qformer import resize_numpy_image_long
 
 
+# 0. Pre config
 model_path = "ckpts"
 
 lora_path = None
@@ -51,72 +40,30 @@ if os.path.exists(os.path.join(model_path, "transformer_ema")):
     subfolder = "transformer_ema"
 else:
     subfolder = "transformer"
-
-transformer = ConsisIDTransformer3DModel.from_pretrained_cus(model_path, subfolder=subfolder)
-scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler")
 
-
-
-
-
-
-#
-
-    upscale_factor=1,
-    face_size=512,
-    crop_ratio=(1, 1),
-    det_model='retinaface_resnet50',
-    save_ext='png',
-    device=device,
-    model_rootpath=os.path.join(model_path, "face_encoder")
-)
-face_helper.face_parse = None
-face_helper.face_parse = init_parsing_model(model_name='bisenet', device=device, model_rootpath=os.path.join(model_path, "face_encoder"))
-face_helper.face_det.eval()
-face_helper.face_parse.eval()
-
-model, _, _ = create_model_and_transforms('EVA02-CLIP-L-14-336', os.path.join(model_path, "face_encoder", "EVA02_CLIP_L_336_psz14_s6B.pt"), force_custom_clip=True)
-face_clip_model = model.visual
-face_clip_model.eval()
-
-eva_transform_mean = getattr(face_clip_model, 'image_mean', OPENAI_DATASET_MEAN)
-eva_transform_std = getattr(face_clip_model, 'image_std', OPENAI_DATASET_STD)
-if not isinstance(eva_transform_mean, (list, tuple)):
-    eva_transform_mean = (eva_transform_mean,) * 3
-if not isinstance(eva_transform_std, (list, tuple)):
-    eva_transform_std = (eva_transform_std,) * 3
-eva_transform_mean = eva_transform_mean
-eva_transform_std = eva_transform_std
-
-face_main_model = FaceAnalysis(name='antelopev2', root=os.path.join(model_path, "face_encoder"), providers=['CUDAExecutionProvider'])
-handler_ante = insightface.model_zoo.get_model(f'{model_path}/face_encoder/models/antelopev2/glintr100.onnx', providers=['CUDAExecutionProvider'])
-face_main_model.prepare(ctx_id=0, det_size=(640, 640))
-handler_ante.prepare(ctx_id=0)
-
-face_clip_model.to(device, dtype=dtype)
-face_helper.face_det.to(device)
-face_helper.face_parse.to(device)
+
+# 1. Prepare all the face models
+face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std = prepare_face_models(model_path, device, dtype)
+
+
+# 2. Load Pipeline.
+transformer = ConsisIDTransformer3DModel.from_pretrained_cus(model_path, subfolder=subfolder)
 transformer.to(device, dtype=dtype)
-
+pipe = ConsisIDPipeline.from_pretrained(model_path, transformer=transformer, torch_dtype=dtype)
 
-pipe = ConsisIDPipeline.from_pretrained(model_path, transformer=transformer, scheduler=scheduler, torch_dtype=dtype)
 # If you're using with lora, add this code
 if lora_path:
     pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
     pipe.fuse_lora(lora_scale=1 / lora_rank)
 
-scheduler_args = {}
-if "variance_type" in pipe.scheduler.config:
-    variance_type = pipe.scheduler.config.variance_type
-    if variance_type in ["learned", "learned_range"]:
-        variance_type = "fixed_small"
-    scheduler_args["variance_type"] = variance_type
 
-
+# 3. Move to device.
+face_helper_1.face_det.to(device)
+face_helper_1.face_parse.to(device)
+face_clip_model.to(device, dtype=dtype)
+transformer.to(device, dtype=dtype)
 pipe.to(device)
-
-# Enable CPU offload for the model.
-# turn on if you don't have multiple GPUs or enough GPU memory(such as H100) and it will cost more time in inference, it may also reduce the quality
+# Save Memory. Turn on if you don't have multiple GPUs or enough GPU memory(such as H100) and it will cost more time in inference, it may also reduce the quality
 pipe.enable_model_cpu_offload()
 pipe.enable_sequential_cpu_offload()
 # pipe.vae.enable_slicing()
@@ -125,6 +72,7 @@ pipe.enable_sequential_cpu_offload()
 os.makedirs("./output", exist_ok=True)
 os.makedirs("./gradio_tmp", exist_ok=True)
 
+# load upscale and interpolation model
 upscale_model = load_sd_upscale(f"{model_path}/model_real_esran/RealESRGAN_x4.pth", device)
 frame_interpolation_model = load_rife_model(f"{model_path}/model_rife")
 
@@ -142,34 +90,21 @@ def generate(
     if seed == -1:
         seed = random.randint(0, 2**8 - 1)
 
-
-
-    id_cond, id_vit_hidden, align_crop_face_image, face_kps = process_face_embeddings(face_helper, face_clip_model, handler_ante,
+    # 4. Prepare model input
+    id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(face_helper_1, face_clip_model, face_helper_2,
                                                                             eva_transform_mean, eva_transform_std,
-                                                                            face_main_model, device, dtype,
-
-
-
-    if is_kps
-        kps_cond = face_kps
-    else:
-        kps_cond = None
-
-    tensor = align_crop_face_image.cpu().detach()
-    tensor = tensor.squeeze()
-    tensor = tensor.permute(1, 2, 0)
-    tensor = tensor.numpy() * 255
-    tensor = tensor.astype(np.uint8)
-    image = ImageOps.exif_transpose(Image.fromarray(tensor))
+                                                                            face_main_model, device, dtype,
+                                                                            image_input, is_align_face=True)
+
+    is_kps = getattr(transformer.config, 'is_kps', False)
+    kps_cond = face_kps if is_kps else None
 
     prompt = prompt.strip('"')
-    if len(negative_prompt) == 0:
-        negative_prompt = None
     if negative_prompt:
         negative_prompt = negative_prompt.strip('"')
 
-
-
+    # 5. Generate Identity-Preserving Video
+    generator = torch.Generator(device).manual_seed(seed) if seed else None
     video_pt = pipe(
         prompt=prompt,
         negative_prompt=negative_prompt,
@@ -388,8 +323,6 @@ with gr.Blocks() as demo:
         seed_update = gr.update(visible=True, value=seed)
 
         return video_path, video_update, gif_update, seed_update
-
-    run.zerogpu = True
 
     generate_button.click(
         fn=run,
@@ -400,4 +333,4 @@ with gr.Blocks() as demo:
 
 if __name__ == "__main__":
     demo.queue(max_size=15)
-    demo.launch()
+    demo.launch()
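The reorganized app.py collapses all face-model setup into prepare_face_models and the per-request embedding extraction into process_face_embeddings_infer. Below is a minimal sketch (not part of the commit) of how those pieces fit together, using only names that appear in the diff above; the checkpoint path, device, prompt, image path, and the exact pipe() keyword usage are illustrative assumptions.

```py
import torch

from models.utils import prepare_face_models, process_face_embeddings_infer
from models.transformer_consisid import ConsisIDTransformer3DModel
from models.pipeline_consisid import ConsisIDPipeline

model_path = "ckpts"        # assumed checkpoint layout, as in app.py
device = "cuda"
dtype = torch.bfloat16

# 1. Face models (detector, parser, CLIP encoder, ArcFace) now come from one helper.
face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std = \
    prepare_face_models(model_path, device, dtype)

# 2. Pipeline: the scheduler is no longer loaded separately; from_pretrained picks it up.
transformer = ConsisIDTransformer3DModel.from_pretrained_cus(model_path, subfolder="transformer")
transformer.to(device, dtype=dtype)
pipe = ConsisIDPipeline.from_pretrained(model_path, transformer=transformer, torch_dtype=dtype)
pipe.to(device)

# 3. Per request: extract identity embeddings from a reference image, then sample.
id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(
    face_helper_1, face_clip_model, face_helper_2,
    eva_transform_mean, eva_transform_std,
    face_main_model, device, dtype,
    "asserts/example_images/1.png",   # hypothetical reference image
    is_align_face=True,
)
kps_cond = face_kps if getattr(transformer.config, "is_kps", False) else None
video = pipe(image, "A person smiles at the camera.",
             id_vit_hidden=id_vit_hidden, id_cond=id_cond, kps_cond=kps_cond)
```

The Space additionally loads a RealESRGAN upscaler and a RIFE frame-interpolation model for post-processing the generated frames, as shown in the diff.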
asserts/example_images/4.png
ADDED
models/local_facial_extractor.py
CHANGED
@@ -4,7 +4,18 @@ import torch.nn as nn
 
 
 # FFN
-def FeedForward(dim, mult=4):
+def ConsisIDFeedForward(dim, mult=4):
+    """
+    Creates a consistent ID feedforward block consisting of layer normalization,
+    two linear layers, and a GELU activation.
+
+    Args:
+        dim (int): The input dimension of the tensor.
+        mult (int, optional): Multiplier for the inner dimension. Default is 4.
+
+    Returns:
+        nn.Sequential: A sequence of layers comprising LayerNorm, Linear layers, and GELU.
+    """
     inner_dim = int(dim * mult)
     return nn.Sequential(
         nn.LayerNorm(dim),
@@ -15,20 +26,41 @@ def FeedForward(dim, mult=4):
 
 
 def reshape_tensor(x, heads):
+    """
+    Reshapes the input tensor for multi-head attention.
+
+    Args:
+        x (torch.Tensor): The input tensor with shape (batch_size, length, width).
+        heads (int): The number of attention heads.
+
+    Returns:
+        torch.Tensor: The reshaped tensor, with shape (batch_size, heads, length, width).
+    """
     bs, length, width = x.shape
-    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
     x = x.view(bs, length, heads, -1)
-    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
     x = x.transpose(1, 2)
-    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
    x = x.reshape(bs, heads, length, -1)
     return x
 
 
 class PerceiverAttention(nn.Module):
+    """
+    Implements the Perceiver attention mechanism with multi-head attention.
+
+    This layer takes two inputs: 'x' (image features) and 'latents' (latent features),
+    applying multi-head attention to both and producing an output tensor with the same
+    dimension as the input tensor 'x'.
+
+    Args:
+        dim (int): The input dimension.
+        dim_head (int, optional): The dimension of each attention head. Default is 64.
+        heads (int, optional): The number of attention heads. Default is 8.
+        kv_dim (int, optional): The key-value dimension. If None, `dim` is used for both keys and values.
+    """
+
     def __init__(self, *, dim, dim_head=64, heads=8, kv_dim=None):
         super().__init__()
-        self.scale = dim_head
+        self.scale = dim_head**-0.5
         self.dim_head = dim_head
         self.heads = heads
         inner_dim = dim_head * heads
@@ -42,21 +74,27 @@ class PerceiverAttention(nn.Module):
 
     def forward(self, x, latents):
         """
+        Forward pass for Perceiver attention.
+
         Args:
-            x (torch.Tensor):
-
-
-
+            x (torch.Tensor): Image features tensor with shape (batch_size, num_pixels, D).
+            latents (torch.Tensor): Latent features tensor with shape (batch_size, num_latents, D).
+
+        Returns:
+            torch.Tensor: Output tensor after applying attention and transformation.
         """
+        # Apply normalization
         x = self.norm1(x)
         latents = self.norm2(latents)
 
-        b, seq_len, _ = latents.shape
+        b, seq_len, _ = latents.shape  # Get batch size and sequence length
 
+        # Compute query, key, and value matrices
         q = self.to_q(latents)
         kv_input = torch.cat((x, latents), dim=-2)
         k, v = self.to_kv(kv_input).chunk(2, dim=-1)
 
+        # Reshape the tensors for multi-head attention
        q = reshape_tensor(q, self.heads)
         k = reshape_tensor(k, self.heads)
         v = reshape_tensor(v, self.heads)
@@ -67,6 +105,7 @@ class PerceiverAttention(nn.Module):
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
         out = weight @ v
 
+        # Reshape and return the final output
         out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
 
         return self.to_out(out)
@@ -74,22 +113,22 @@ class PerceiverAttention(nn.Module):
 
 class LocalFacialExtractor(nn.Module):
     def __init__(
-
-
-
-
-
-
-
-
-
+        self,
+        dim=1024,
+        depth=10,
+        dim_head=64,
+        heads=16,
+        num_id_token=5,
+        num_queries=32,
+        output_dim=2048,
+        ff_mult=4,
     ):
         """
         Initializes the LocalFacialExtractor class.
 
         Parameters:
         - dim (int): The dimensionality of latent features.
-        - depth (int): Total number of PerceiverAttention and
+        - depth (int): Total number of PerceiverAttention and ConsisIDFeedForward layers.
         - dim_head (int): Dimensionality of each attention head.
         - heads (int): Number of attention heads.
        - num_id_token (int): Number of tokens used for identity features.
@@ -105,21 +144,21 @@ class LocalFacialExtractor(nn.Module):
         self.num_queries = num_queries
         assert depth % 5 == 0
         self.depth = depth // 5
-        scale = dim
+        scale = dim**-0.5
 
         # Learnable latent query embeddings
         self.latents = nn.Parameter(torch.randn(1, num_queries, dim) * scale)
         # Projection layer to map the latent output to the desired dimension
         self.proj_out = nn.Parameter(scale * torch.randn(dim, output_dim))
 
-        # Attention and
+        # Attention and ConsisIDFeedForward layer stack
         self.layers = nn.ModuleList([])
         for _ in range(depth):
             self.layers.append(
                 nn.ModuleList(
                     [
                         PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),  # Perceiver Attention layer
-
+                        ConsisIDFeedForward(dim=dim, mult=ff_mult),  # ConsisIDFeedForward layer
                     ]
                 )
             )
@@ -128,7 +167,7 @@ class LocalFacialExtractor(nn.Module):
         for i in range(5):
             setattr(
                 self,
-                f
+                f"mapping_{i}",
                 nn.Sequential(
                     nn.Linear(1024, 1024),
                     nn.LayerNorm(1024),
@@ -175,30 +214,30 @@ class LocalFacialExtractor(nn.Module):
 
         # Process each of the 5 visual feature inputs
         for i in range(5):
-            vit_feature = getattr(self, f
+            vit_feature = getattr(self, f"mapping_{i}")(y[i])
             ctx_feature = torch.cat((x, vit_feature), dim=1)
 
-            # Pass through the PerceiverAttention and
-            for attn, ff in self.layers[i * self.depth: (i + 1) * self.depth]:
+            # Pass through the PerceiverAttention and ConsisIDFeedForward layers
+            for attn, ff in self.layers[i * self.depth : (i + 1) * self.depth]:
                 latents = attn(ctx_feature, latents) + latents
                 latents = ff(latents) + latents
 
         # Retain only the query latents
-        latents = latents[:, :self.num_queries]
+        latents = latents[:, : self.num_queries]
         # Project the latents to the output dimension
         latents = latents @ self.proj_out
         return latents
-
+
 
 class PerceiverCrossAttention(nn.Module):
     """
-
+
     Args:
         dim (int): Dimension of the input latent and output. Default is 3072.
         dim_head (int): Dimension of each attention head. Default is 128.
         heads (int): Number of attention heads. Default is 16.
         kv_dim (int): Dimension of the key/value input, allowing flexible cross-attention. Default is 2048.
-
+
     Attributes:
         scale (float): Scaling factor used in dot-product attention for numerical stability.
         norm1 (nn.LayerNorm): Layer normalization applied to the input image features.
@@ -208,9 +247,10 @@ class PerceiverCrossAttention(nn.Module):
        to_out (nn.Linear): Linear layer for outputting the final result after attention.
 
    """
+
    def __init__(self, *, dim=3072, dim_head=128, heads=16, kv_dim=2048):
        super().__init__()
-        self.scale = dim_head
+        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads
@@ -232,13 +272,13 @@ class PerceiverCrossAttention(nn.Module):
                - batch_size (b): Number of samples in the batch.
                - n1: Sequence length (e.g., number of patches or tokens).
                - D: Feature dimension.
-
+
            latents (torch.Tensor): Latent feature representations with shape (batch_size, n2, D), where:
                - n2: Number of latent elements.
-
+
        Returns:
            torch.Tensor: Attention-modulated features with shape (batch_size, n2, D).
-
+
        """
        # Apply layer normalization to the input image and latent features
        x = self.norm1(x)
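The new docstrings in local_facial_extractor.py describe how tokens are reshaped for multi-head attention and how the attention scale is set. Here is a small, self-contained sketch (not from the repository) of that reshape logic together with the 1/sqrt(dim_head) scale the new code uses; the tensor sizes are arbitrary.

```py
import torch


def reshape_tensor(x, heads):
    # Same logic as in models/local_facial_extractor.py: split the channel dim into
    # heads and move the head dim ahead of the sequence dim.
    bs, length, width = x.shape
    x = x.view(bs, length, heads, -1)
    x = x.transpose(1, 2)
    x = x.reshape(bs, heads, length, -1)
    return x


q = torch.randn(2, 32, 1024)           # (batch, num_latents, dim); illustrative sizes
q_heads = reshape_tensor(q, heads=16)  # -> torch.Size([2, 16, 32, 64])
print(q_heads.shape)

dim_head = q_heads.shape[-1]
scale = dim_head**-0.5                 # standard 1/sqrt(d) attention scaling used by the new code
print(scale)
```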
models/pipeline_consisid.py
CHANGED
@@ -1,8 +1,16 @@
-# Copyright
-#
-
-#
-#
+# Copyright 2024 ConsisID Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import inspect
 import math
@@ -13,20 +21,19 @@ import sys
 import PIL
 import numpy as np
 import cv2
-from PIL import Image
 import torch
+from dataclasses import dataclass
 from transformers import T5EncoderModel, T5Tokenizer
 
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.image_processor import PipelineImageInput
-from diffusers.models import AutoencoderKLCogVideoX
+from diffusers.models import AutoencoderKLCogVideoX
 from diffusers.models.embeddings import get_3d_rotary_pos_embed
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from diffusers.utils import logging, replace_example_docstring
+from diffusers.utils import logging, replace_example_docstring, BaseOutput
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
-from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput
 
 from models.transformer_consisid import ConsisIDTransformer3DModel
 
@@ -37,26 +44,28 @@ for project_root in project_roots:
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
-        >>> from diffusers import
+        >>> from diffusers import ConsisIDPipeline
         >>> from diffusers.utils import export_to_video, load_image
 
-        >>> pipe =
+        >>> pipe = ConsisIDPipeline.from_pretrained("https://huggingface.co/BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
 
-        >>> prompt = "
+        >>> prompt = "A woman adorned with a delicate flower crown, is standing amidst a field of gently swaying wildflowers. Her eyes sparkle with a serene gaze, and a faint smile graces her lips, suggesting a moment of peaceful contentment. The shot is framed from the waist up, highlighting the gentle breeze lightly tousling her hair. The background reveals an expansive meadow under a bright blue sky, capturing the tranquility of a sunny afternoon."
         >>> image = load_image(
-        ...     "https://
+        ...     "https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/1.png?raw=true"
         ... )
         >>> video = pipe(image, prompt, use_dynamic_cfg=True)
         >>> export_to_video(video.frames[0], "output.mp4", fps=8)
         ```
 """
 
-
+
+def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
     stickwidth = 4
     limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
     kps = np.array(kps)
@@ -72,7 +81,9 @@ def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,2
         y = kps[index][:, 1]
         length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
         angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
-        polygon = cv2.ellipse2Poly(
+        polygon = cv2.ellipse2Poly(
+            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
+        )
         out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
         out_img = (out_img * 0.6).astype(np.uint8)
 
@@ -81,9 +92,10 @@ def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,2
         x, y = kp
         out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
 
-    out_img_pil = Image.fromarray(out_img.astype(np.uint8))
+    out_img_pil = PIL.Image.fromarray(out_img.astype(np.uint8))
     return out_img_pil
 
+
 def process_image(image, vae):
     image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=image.device)
     image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=image.dtype)
@@ -92,6 +104,7 @@ def process_image(image, vae):
     image_latent_dist = vae.encode(input_image).latent_dist
     return image_latent_dist
 
+
 # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
 def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
     tw = tgt_width
@@ -185,9 +198,24 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
+@dataclass
+class ConsisIDPipelineOutput(BaseOutput):
+    r"""
+    Output class for ConsisID pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
+
+
 class ConsisIDPipeline(DiffusionPipeline):
     r"""
-    Pipeline for image-to-video generation using
+    Pipeline for image-to-video generation using ConsisID.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
@@ -196,7 +224,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
-            Frozen text-encoder.
+            Frozen text-encoder. ConsisID uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
@@ -222,7 +250,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         tokenizer: T5Tokenizer,
         text_encoder: T5EncoderModel,
         vae: AutoencoderKLCogVideoX,
-        transformer: Union[ConsisIDTransformer3DModel
+        transformer: Union[ConsisIDTransformer3DModel],
         scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
     ):
         super().__init__()
@@ -246,7 +274,7 @@ class ConsisIDPipeline(DiffusionPipeline):
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisID.ConsisIDPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
         self,
         prompt: Union[str, List[str]] = None,
@@ -289,7 +317,7 @@ class ConsisIDPipeline(DiffusionPipeline):
 
         return prompt_embeds
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisid.ConsisIDPipeline.encode_prompt
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -409,7 +437,8 @@ class ConsisIDPipeline(DiffusionPipeline):
         if kps_cond is not None:
             kps_cond = kps_cond.unsqueeze(2)
             kps_cond_latents = [
-                retrieve_latents(self.vae.encode(kps_cond[i].unsqueeze(0)), generator[i])
+                retrieve_latents(self.vae.encode(kps_cond[i].unsqueeze(0)), generator[i])
+                for i in range(batch_size)
             ]
         else:
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
@@ -455,7 +484,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         latents = latents * self.scheduler.init_noise_sigma
         return latents, image_latents
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisid.ConsisIDPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
         latents = 1 / self.vae_scaling_factor_image * latents
@@ -554,13 +583,13 @@ class ConsisIDPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisid.ConsisIDPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""
         self.fusing_transformer = True
         self.transformer.fuse_qkv_projections()
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisid.ConsisIDPipeline.unfuse_qkv_projections
     def unfuse_qkv_projections(self) -> None:
         r"""Disable QKV projection fusion if enabled."""
         if not self.fusing_transformer:
@@ -569,7 +598,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         self.transformer.unfuse_qkv_projections()
         self.fusing_transformer = False
 
-    # Copied from diffusers.pipelines.
+    # Copied from diffusers.pipelines.consisid.pipeline_consisid.ConsisIDPipeline._prepare_rotary_positional_embeddings
     def _prepare_rotary_positional_embeddings(
         self,
         height: int,
@@ -638,7 +667,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         id_vit_hidden: Optional[torch.Tensor] = None,
         id_cond: Optional[torch.Tensor] = None,
         kps_cond: Optional[torch.Tensor] = None,
-    ) -> Union[
+    ) -> Union[ConsisIDPipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
 
@@ -658,7 +687,7 @@ class ConsisIDPipeline(DiffusionPipeline):
                The width in pixels of the generated image. This is set to 720 by default for the best results.
            num_frames (`int`, defaults to `48`):
                Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
-                contain 1 extra frame because
+                contain 1 extra frame because ConsisID is conditioned with (num_seconds * fps + 1) frames where
                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
                needs to be satisfied is that of divisibility mentioned above.
            num_inference_steps (`int`, *optional*, defaults to 50):
@@ -712,8 +741,8 @@ class ConsisIDPipeline(DiffusionPipeline):
        Examples:
 
        Returns:
-            [`~pipelines.
-            [`~pipelines.
+            [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] or `tuple`:
+            [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
        if num_frames > 49:
@@ -784,7 +813,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         image = self.video_processor.preprocess(image, height=height, width=width).to(
             device, dtype=prompt_embeds.dtype
         )
-
+
         latent_channels = self.transformer.config.in_channels // 2
         latents, image_latents = self.prepare_latents(
             image,
@@ -797,9 +826,9 @@ class ConsisIDPipeline(DiffusionPipeline):
             device,
             generator,
             latents,
-            kps_cond
+            kps_cond,
         )
-
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
@@ -836,8 +865,8 @@ class ConsisIDPipeline(DiffusionPipeline):
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
                     return_dict=False,
-                    id_vit_hidden
-                    id_cond
+                    id_vit_hidden=id_vit_hidden,
+                    id_cond=id_cond,
                 )[0]
                 noise_pred = noise_pred.float()
 
@@ -891,4 +920,4 @@ class ConsisIDPipeline(DiffusionPipeline):
         if not return_dict:
             return (video,)
 
-        return
+        return ConsisIDPipelineOutput(frames=video)
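pipeline_consisid.py keeps the draw_kps helper that rasterizes the five facial keypoints used as kps_cond. Below is a hedged usage sketch (not part of the commit), assuming the module path introduced by this reorganization; the keypoint coordinates and canvas size are made up.

```py
import numpy as np
from PIL import Image

# Assumes the repo layout from this commit; draw_kps lives in models/pipeline_consisid.py.
from models.pipeline_consisid import draw_kps

# Five facial keypoints (eyes, nose, mouth corners) in pixel coordinates; values are illustrative.
kps = np.array([[180.0, 200.0], [300.0, 200.0], [240.0, 260.0], [200.0, 320.0], [280.0, 320.0]])
canvas = Image.new("RGB", (480, 480), color=(0, 0, 0))

# Returns a PIL image with the limb/keypoint overlay that the pipeline encodes as kps_cond.
kps_image = draw_kps(canvas, kps)
kps_image.save("kps_overlay.png")
```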
models/transformer_consisid.py
CHANGED
@@ -1,8 +1,16 @@
|
|
1 |
-
# Copyright
|
2 |
-
#
|
3 |
-
|
4 |
-
#
|
5 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
from typing import Any, Dict, Optional, Tuple, Union
|
8 |
import os
|
@@ -38,9 +46,9 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
|
38 |
|
39 |
|
40 |
@maybe_allow_in_graph
|
41 |
-
class
|
42 |
r"""
|
43 |
-
Transformer block used in [
|
44 |
|
45 |
Parameters:
|
46 |
dim (`int`):
|
@@ -132,9 +140,6 @@ class CogVideoXBlock(nn.Module):
|
|
132 |
hidden_states, encoder_hidden_states, temb
|
133 |
)
|
134 |
|
135 |
-
# insert here
|
136 |
-
# pass
|
137 |
-
|
138 |
# attention
|
139 |
attn_hidden_states, attn_encoder_hidden_states = self.attn1(
|
140 |
hidden_states=norm_hidden_states,
|
@@ -162,7 +167,7 @@ class CogVideoXBlock(nn.Module):
|
|
162 |
|
163 |
class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
164 |
"""
|
165 |
-
A Transformer model for video-like data in [
|
166 |
|
167 |
Parameters:
|
168 |
num_attention_heads (`int`, defaults to `30`):
|
@@ -191,7 +196,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
191 |
The height of the input latents.
|
192 |
sample_frames (`int`, defaults to `49`):
|
193 |
The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
|
194 |
-
instead of 13 because
|
195 |
but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
|
196 |
K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
|
197 |
patch_size (`int`, defaults to `2`):
|
@@ -212,6 +217,32 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
212 |
Scaling factor to apply in 3D positional embeddings across spatial dimensions.
|
213 |
temporal_interpolation_scale (`float`, defaults to `1.0`):
|
214 |
Scaling factor to apply in 3D positional embeddings across temporal dimensions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
"""
|
216 |
|
217 |
_supports_gradient_checkpointing = True
|
@@ -257,7 +288,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
257 |
|
258 |
if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
|
259 |
raise ValueError(
|
260 |
-
"There are no
|
261 |
"embeddings. If you're using a custom model and/or believe this should be supported, please open an "
|
262 |
"issue at https://github.com/huggingface/diffusers/issues."
|
263 |
)
|
@@ -288,7 +319,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
288 |
# 3. Define spatio-temporal transformers blocks
|
289 |
self.transformer_blocks = nn.ModuleList(
|
290 |
[
|
291 |
-
|
292 |
dim=inner_dim,
|
293 |
num_attention_heads=num_attention_heads,
|
294 |
attention_head_dim=attention_head_dim,
|
@@ -319,6 +350,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
319 |
self.is_train_face = is_train_face
|
320 |
self.is_kps = is_kps
|
321 |
|
|
|
322 |
if is_train_face:
|
323 |
self.inner_dim = inner_dim
|
324 |
self.cross_attn_interval = cross_attn_interval
|
@@ -338,21 +370,26 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
338 |
weight_dtype = next(self.transformer_blocks.parameters()).dtype
|
339 |
self.local_facial_extractor = LocalFacialExtractor()
|
340 |
self.local_facial_extractor.to(device, dtype=weight_dtype)
|
341 |
-
self.perceiver_cross_attention = nn.ModuleList(
|
342 |
-
|
343 |
-
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
def save_face_modules(self, path: str):
|
346 |
save_dict = {
|
347 |
-
|
348 |
-
|
349 |
}
|
350 |
torch.save(save_dict, path)
|
351 |
|
352 |
def load_face_modules(self, path: str):
|
353 |
checkpoint = torch.load(path, map_location=self.device)
|
354 |
-
self.local_facial_extractor.load_state_dict(checkpoint[
|
355 |
-
for ca, state_dict in zip(self.perceiver_cross_attention, checkpoint[
|
356 |
ca.load_state_dict(state_dict)
|
357 |
|
358 |
@property
|
@@ -463,14 +500,16 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
463 |
timestep_cond: Optional[torch.Tensor] = None,
|
464 |
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
465 |
attention_kwargs: Optional[Dict[str, Any]] = None,
|
466 |
-
id_cond: Optional[torch.Tensor] = None,
|
467 |
id_vit_hidden: Optional[torch.Tensor] = None,
|
468 |
return_dict: bool = True,
|
469 |
):
|
470 |
# fuse clip and insightface
|
471 |
if self.is_train_face:
|
472 |
assert id_cond is not None and id_vit_hidden is not None
|
473 |
-
valid_face_emb = self.local_facial_extractor(
|
|
|
|
|
474 |
|
475 |
if attention_kwargs is not None:
|
476 |
attention_kwargs = attention_kwargs.copy()
|
@@ -506,7 +545,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
506 |
|
507 |
text_seq_length = encoder_hidden_states.shape[1]
|
508 |
encoder_hidden_states = hidden_states[:, :text_seq_length] # torch.Size([1, 226, 3072])
|
509 |
-
hidden_states = hidden_states[:, text_seq_length:]
|
510 |
|
511 |
# 3. Transformer blocks
|
512 |
ca_idx = 0
|
@@ -538,17 +577,14 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
538 |
|
539 |
if self.is_train_face:
|
540 |
if i % self.cross_attn_interval == 0 and valid_face_emb is not None:
|
541 |
-
hidden_states = hidden_states + self.local_face_scale * self.perceiver_cross_attention[ca_idx](
|
|
|
|
|
542 |
ca_idx += 1
|
543 |
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
else:
|
548 |
-
# CogVideoX-5B
|
549 |
-
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
550 |
-
hidden_states = self.norm_final(hidden_states)
|
551 |
-
hidden_states = hidden_states[:, text_seq_length:]
|
552 |
|
553 |
# 4. Final block
|
554 |
hidden_states = self.norm_out(hidden_states, temb=emb)
|
@@ -556,8 +592,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
556 |
|
557 |
# 5. Unpatchify
|
558 |
# Note: we use `-1` instead of `channels`:
|
559 |
-
# - It is okay to `channels` use for
|
560 |
-
# - However, for CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
|
561 |
p = self.config.patch_size
|
562 |
output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
|
563 |
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
|
|
|
1 |
+
# Copyright 2024 ConsisID Authors and The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
|
15 |
from typing import Any, Dict, Optional, Tuple, Union
|
16 |
import os
|
|
|
46 |
|
47 |
|
48 |
@maybe_allow_in_graph
|
49 |
+
class ConsisIDBlock(nn.Module):
|
50 |
r"""
|
51 |
+
Transformer block used in [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) model.
|
52 |
|
53 |
Parameters:
|
54 |
dim (`int`):
|
|
|
140 |
hidden_states, encoder_hidden_states, temb
|
141 |
)
|
142 |
|
|
|
|
|
|
|
143 |
# attention
|
144 |
attn_hidden_states, attn_encoder_hidden_states = self.attn1(
|
145 |
hidden_states=norm_hidden_states,
|
|
|
167 |
|
168 |
class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
169 |
"""
|
170 |
+
A Transformer model for video-like data in [ConsisID](https://github.com/PKU-YuanGroup/ConsisID).
|
171 |
|
172 |
Parameters:
|
173 |
num_attention_heads (`int`, defaults to `30`):
|
|
|
196 |
The height of the input latents.
|
197 |
sample_frames (`int`, defaults to `49`):
|
198 |
The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
|
199 |
+
instead of 13 because ConsisID processed 13 latent frames at once in its default and recommended settings,
|
200 |
but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
|
201 |
K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
|
202 |
patch_size (`int`, defaults to `2`):
|
|
|
217 |
Scaling factor to apply in 3D positional embeddings across spatial dimensions.
|
218 |
temporal_interpolation_scale (`float`, defaults to `1.0`):
|
219 |
Scaling factor to apply in 3D positional embeddings across temporal dimensions.
|
220 |
+
is_train_face (`bool`, defaults to `False`):
|
221 |
+
Whether to use enable the identity-preserving module during the training process.
|
222 |
+
When set to `True`, the model will focus on identity-preserving tasks.
|
223 |
+
is_kps (`bool`, defaults to `False`):
|
224 |
+
Whether to enable keypoint for global facial extractor.
|
225 |
+
If `True`, keypoints will be in the model.
|
226 |
+
cross_attn_interval (`int`, defaults to `1`):
|
227 |
+
The interval between cross-attention layers in the Transformer architecture.
|
228 |
+
A larger value may reduce the frequency of cross-attention computations,
|
229 |
+
which can help reduce computational overhead.
|
230 |
+
LFE_num_tokens (`int`, defaults to `32`):
|
231 |
+
The number of tokens to use in the Local Facial Extractor (LFE).
|
232 |
+
This module is responsible for capturing high frequency representations
|
233 |
+
of the face.
|
234 |
+
LFE_output_dim (`int`, defaults to `768`):
|
235 |
+
The output dimension of the Local Facial Extractor (LFE) module.
|
236 |
+
This dimension determines the size of the feature vectors produced
|
237 |
+
by the LFE module.
|
238 |
+
LFE_heads (`int`, defaults to `12`):
|
239 |
+
The number of attention heads used in the Local Facial Extractor (LFE) module.
|
240 |
+
More heads may improve the ability to capture diverse features, but
|
241 |
+
can also increase computational complexity.
|
242 |
+
local_face_scale (`float`, defaults to `1.0`):
|
243 |
+
A scaling factor used to adjust the importance of local facial features
|
244 |
+
in the model. This can influence how strongly the model focuses on
|
245 |
+
high frequency face-related content.
|
246 |
"""
|
247 |
|
248 |
_supports_gradient_checkpointing = True
|
|
|
288 |
|
289 |
if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
|
290 |
raise ValueError(
|
291 |
+
"There are no ConsisID checkpoints available with disable rotary embeddings and learned positional "
|
292 |
"embeddings. If you're using a custom model and/or believe this should be supported, please open an "
|
293 |
"issue at https://github.com/huggingface/diffusers/issues."
|
294 |
)
|
|
|
319 |
# 3. Define spatio-temporal transformers blocks
|
320 |
self.transformer_blocks = nn.ModuleList(
|
321 |
[
|
322 |
+
ConsisIDBlock(
|
323 |
dim=inner_dim,
|
324 |
num_attention_heads=num_attention_heads,
|
325 |
attention_head_dim=attention_head_dim,
|
|
|
350 |
self.is_train_face = is_train_face
|
351 |
self.is_kps = is_kps
|
352 |
|
353 |
+
# 5. Define identity-preserving config
|
354 |
if is_train_face:
|
355 |
self.inner_dim = inner_dim
|
356 |
self.cross_attn_interval = cross_attn_interval
|
|
|
370 |
weight_dtype = next(self.transformer_blocks.parameters()).dtype
|
371 |
self.local_facial_extractor = LocalFacialExtractor()
|
372 |
self.local_facial_extractor.to(device, dtype=weight_dtype)
|
373 |
+
self.perceiver_cross_attention = nn.ModuleList(
|
374 |
+
[
|
375 |
+
PerceiverCrossAttention(
|
376 |
+
dim=self.inner_dim, dim_head=128, heads=16, kv_dim=self.LFE_final_output_dim
|
377 |
+
).to(device, dtype=weight_dtype)
|
378 |
+
for _ in range(self.num_ca)
|
379 |
+
]
|
380 |
+
)
|
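One `PerceiverCrossAttention` module is created per injection point, i.e. for every `cross_attn_interval`-th transformer block (see the forward pass further below). As a rough, hypothetical illustration of the dimensions involved, the shape comments in this file imply `inner_dim` is 3072 and `LFE_final_output_dim` is 2048 in the released configuration, so a single injection module would be parameterized roughly as follows (the import location is an assumption):

```python
# Hypothetical standalone instantiation of one injection module, using the dimensions
# implied by the shape comments in this file (inner_dim 3072, LFE output dim 2048).
from models.local_facial_extractor import PerceiverCrossAttention  # assumed import location

ca = PerceiverCrossAttention(dim=3072, dim_head=128, heads=16, kv_dim=2048)
```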
381 |
|
382 |
def save_face_modules(self, path: str):
|
383 |
save_dict = {
|
384 |
+
"local_facial_extractor": self.local_facial_extractor.state_dict(),
|
385 |
+
"perceiver_cross_attention": [ca.state_dict() for ca in self.perceiver_cross_attention],
|
386 |
}
|
387 |
torch.save(save_dict, path)
|
388 |
|
389 |
def load_face_modules(self, path: str):
|
390 |
checkpoint = torch.load(path, map_location=self.device)
|
391 |
+
self.local_facial_extractor.load_state_dict(checkpoint["local_facial_extractor"])
|
392 |
+
for ca, state_dict in zip(self.perceiver_cross_attention, checkpoint["perceiver_cross_attention"]):
|
393 |
ca.load_state_dict(state_dict)
|
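A quick usage note for the two helpers above (illustrative only; `transformer` stands for an instance of this class and the filename is hypothetical):

```python
# Persist only the identity-preserving weights (Local Facial Extractor + perceiver
# cross-attention modules), separate from the main transformer checkpoint...
transformer.save_face_modules("consisid_face_modules.pt")

# ...and restore them later into another instance with the same configuration.
transformer.load_face_modules("consisid_face_modules.pt")
```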
394 |
|
395 |
@property
|
|
|
500 |
timestep_cond: Optional[torch.Tensor] = None,
|
501 |
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
502 |
attention_kwargs: Optional[Dict[str, Any]] = None,
|
503 |
+
id_cond: Optional[torch.Tensor] = None,
|
504 |
id_vit_hidden: Optional[torch.Tensor] = None,
|
505 |
return_dict: bool = True,
|
506 |
):
|
507 |
# fuse clip and insightface
|
508 |
if self.is_train_face:
|
509 |
assert id_cond is not None and id_vit_hidden is not None
|
510 |
+
valid_face_emb = self.local_facial_extractor(
|
511 |
+
id_cond, id_vit_hidden
|
512 |
+
) # torch.Size([1, 1280]), list[5](torch.Size([1, 577, 1024])) -> torch.Size([1, 32, 2048])
|
513 |
|
514 |
if attention_kwargs is not None:
|
515 |
attention_kwargs = attention_kwargs.copy()
|
|
|
545 |
|
546 |
text_seq_length = encoder_hidden_states.shape[1]
|
547 |
encoder_hidden_states = hidden_states[:, :text_seq_length] # torch.Size([1, 226, 3072])
|
548 |
+
hidden_states = hidden_states[:, text_seq_length:] # torch.Size([1, 17550, 3072])
|
549 |
|
550 |
# 3. Transformer blocks
|
551 |
ca_idx = 0
|
|
|
577 |
|
578 |
if self.is_train_face:
|
579 |
if i % self.cross_attn_interval == 0 and valid_face_emb is not None:
|
580 |
+
hidden_states = hidden_states + self.local_face_scale * self.perceiver_cross_attention[ca_idx](
|
581 |
+
valid_face_emb, hidden_states
|
582 |
+
) # torch.Size([2, 32, 2048]) torch.Size([2, 17550, 3072])
|
583 |
ca_idx += 1
|
584 |
|
585 |
+
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
586 |
+
hidden_states = self.norm_final(hidden_states)
|
587 |
+
hidden_states = hidden_states[:, text_seq_length:]
|
|
588 |
|
589 |
# 4. Final block
|
590 |
hidden_states = self.norm_out(hidden_states, temb=emb)
|
|
|
592 |
|
593 |
# 5. Unpatchify
|
594 |
# Note: we use `-1` instead of `channels`:
|
595 |
+
# - It is okay to use `channels` for ConsisID (the number of input channels equals the number of output channels)
|
|
|
596 |
p = self.config.patch_size
|
597 |
output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
|
598 |
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
|
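To make the identity inputs concrete: the inline comments above give `id_cond` the shape [1, 1280] (a 512-d antelopev2 embedding concatenated with a 768-d CLIP-derived feature) and `id_vit_hidden` a list of five [1, 577, 1024] ViT feature maps. Below is a hedged sketch of a forward call with dummy tensors of those shapes; the latent, prompt-embedding, and timestep arguments are placeholders for whatever the surrounding pipeline normally supplies, and their exact shapes here are assumptions.

```python
import torch

# Dummy identity inputs with the shapes noted in the comments above (batch size 1).
id_cond = torch.randn(1, 1280)                                 # antelopev2 (512) + CLIP feature (768)
id_vit_hidden = [torch.randn(1, 577, 1024) for _ in range(5)]  # intermediate EVA-CLIP ViT states

# Placeholder denoising inputs; in practice these come from the ConsisID pipeline
# (shapes below are assumptions, not taken from this file).
latents = torch.randn(1, 13, 16, 60, 90)       # (B, F, C, H // 8, W // 8)
prompt_embeds = torch.randn(1, 226, 4096)      # T5 prompt embeddings
timestep = torch.tensor([999])

output = transformer(
    hidden_states=latents,
    encoder_hidden_states=prompt_embeds,
    timestep=timestep,
    id_cond=id_cond,
    id_vit_hidden=id_vit_hidden,
    return_dict=False,
)
```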
models/utils.py
CHANGED
@@ -1,7 +1,8 @@
|
|
|
|
1 |
import cv2
|
2 |
import math
|
3 |
import numpy as np
|
4 |
-
from PIL import Image
|
5 |
|
6 |
import torch
|
7 |
from torchvision.transforms import InterpolationMode
|
@@ -10,7 +11,16 @@ from transformers import T5EncoderModel, T5Tokenizer
|
|
10 |
from typing import List, Optional, Tuple, Union
|
11 |
from diffusers.models.embeddings import get_3d_rotary_pos_embed
|
12 |
from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
|
|
|
13 |
14 |
|
15 |
def tensor_to_pil(src_img_tensor):
|
16 |
img = src_img_tensor.clone().detach()
|
@@ -204,12 +214,12 @@ def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,2
|
|
204 |
return out_img_pil
|
205 |
|
206 |
|
207 |
-
def process_face_embeddings(
|
208 |
"""
|
209 |
Args:
|
210 |
image: numpy rgb image, range [0, 255]
|
211 |
"""
|
212 |
-
|
213 |
image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # (724, 502, 3)
|
214 |
# get antelopev2 embedding
|
215 |
face_info = app.get(image_bgr)
|
@@ -224,19 +234,19 @@ def process_face_embeddings(face_helper, clip_vision_model, handler_ante, eva_tr
|
|
224 |
face_kps = None
|
225 |
|
226 |
# using facexlib to detect and align face
|
227 |
-
|
228 |
-
|
229 |
if face_kps is None:
|
230 |
-
face_kps =
|
231 |
-
|
232 |
-
if len(
|
233 |
raise RuntimeError('facexlib align face fail')
|
234 |
-
align_face =
|
235 |
|
236 |
# in case insightface didn't detect a face
|
237 |
if id_ante_embedding is None:
|
238 |
print('fail to detect face using insightface, extract embedding on align face')
|
239 |
-
id_ante_embedding =
|
240 |
|
241 |
id_ante_embedding = torch.from_numpy(id_ante_embedding).to(device, weight_dtype) # torch.Size([512])
|
242 |
if id_ante_embedding.ndim == 1:
|
@@ -246,7 +256,7 @@ def process_face_embeddings(face_helper, clip_vision_model, handler_ante, eva_tr
|
|
246 |
if is_align_face:
|
247 |
input = img2tensor(align_face, bgr2rgb=True).unsqueeze(0) / 255.0 # torch.Size([1, 3, 512, 512])
|
248 |
input = input.to(device)
|
249 |
-
parsing_out =
|
250 |
parsing_out = parsing_out.argmax(dim=1, keepdim=True) # torch.Size([1, 1, 512, 512])
|
251 |
bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
|
252 |
bg = sum(parsing_out == i for i in bg_label).bool()
|
@@ -270,4 +280,84 @@ def process_face_embeddings(face_helper, clip_vision_model, handler_ante, eva_tr
|
|
270 |
|
271 |
id_cond = torch.cat([id_ante_embedding, id_cond_vit], dim=-1) # torch.Size([1, 512]), torch.Size([1, 768]) -> torch.Size([1, 1280])
|
272 |
|
273 |
-
return id_cond, id_vit_hidden, return_face_features_image_2, face_kps # torch.Size([1, 1280]), list(torch.Size([1, 577, 1024]))
|
1 |
+
import os
|
2 |
import cv2
|
3 |
import math
|
4 |
import numpy as np
|
5 |
+
from PIL import Image, ImageOps
|
6 |
|
7 |
import torch
|
8 |
from torchvision.transforms import InterpolationMode
|
|
|
11 |
from typing import List, Optional, Tuple, Union
|
12 |
from diffusers.models.embeddings import get_3d_rotary_pos_embed
|
13 |
from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
|
14 |
+
from diffusers.utils import load_image
|
15 |
|
16 |
+
import insightface
|
17 |
+
from insightface.app import FaceAnalysis
|
18 |
+
from facexlib.parsing import init_parsing_model
|
19 |
+
from facexlib.utils.face_restoration_helper import FaceRestoreHelper
|
20 |
+
|
21 |
+
from models.eva_clip import create_model_and_transforms
|
22 |
+
from models.eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
|
23 |
+
from models.eva_clip.utils_qformer import resize_numpy_image_long
|
24 |
|
25 |
def tensor_to_pil(src_img_tensor):
|
26 |
img = src_img_tensor.clone().detach()
|
|
|
214 |
return out_img_pil
|
215 |
|
216 |
|
217 |
+
def process_face_embeddings(face_helper_1, clip_vision_model, face_helper_2, eva_transform_mean, eva_transform_std, app, device, weight_dtype, image, original_id_image=None, is_align_face=True):
|
218 |
"""
|
219 |
Args:
|
220 |
image: numpy rgb image, range [0, 255]
|
221 |
"""
|
222 |
+
face_helper_1.clean_all()
|
223 |
image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # (724, 502, 3)
|
224 |
# get antelopev2 embedding
|
225 |
face_info = app.get(image_bgr)
|
|
|
234 |
face_kps = None
|
235 |
|
236 |
# using facexlib to detect and align face
|
237 |
+
face_helper_1.read_image(image_bgr)
|
238 |
+
face_helper_1.get_face_landmarks_5(only_center_face=True)
|
239 |
if face_kps is None:
|
240 |
+
face_kps = face_helper_1.all_landmarks_5[0]
|
241 |
+
face_helper_1.align_warp_face()
|
242 |
+
if len(face_helper_1.cropped_faces) == 0:
|
243 |
raise RuntimeError('facexlib align face fail')
|
244 |
+
align_face = face_helper_1.cropped_faces[0] # (512, 512, 3) # RGB
|
245 |
|
246 |
# in case insightface didn't detect a face
|
247 |
if id_ante_embedding is None:
|
248 |
print('fail to detect face using insightface, extract embedding on align face')
|
249 |
+
id_ante_embedding = face_helper_2.get_feat(align_face)
|
250 |
|
251 |
id_ante_embedding = torch.from_numpy(id_ante_embedding).to(device, weight_dtype) # torch.Size([512])
|
252 |
if id_ante_embedding.ndim == 1:
|
|
|
256 |
if is_align_face:
|
257 |
input = img2tensor(align_face, bgr2rgb=True).unsqueeze(0) / 255.0 # torch.Size([1, 3, 512, 512])
|
258 |
input = input.to(device)
|
259 |
+
parsing_out = face_helper_1.face_parse(normalize(input, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))[0]
|
260 |
parsing_out = parsing_out.argmax(dim=1, keepdim=True) # torch.Size([1, 1, 512, 512])
|
261 |
bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
|
262 |
bg = sum(parsing_out == i for i in bg_label).bool()
|
|
|
280 |
|
281 |
id_cond = torch.cat([id_ante_embedding, id_cond_vit], dim=-1) # torch.Size([1, 512]), torch.Size([1, 768]) -> torch.Size([1, 1280])
|
282 |
|
283 |
+
return id_cond, id_vit_hidden, return_face_features_image_2, face_kps # torch.Size([1, 1280]), list(torch.Size([1, 577, 1024]))
|
284 |
+
|
285 |
+
|
286 |
+
def process_face_embeddings_infer(face_helper_1, clip_vision_model, face_helper_2, eva_transform_mean, eva_transform_std, app, device, weight_dtype, img_file_path, is_align_face=True):
|
287 |
+
"""
|
288 |
+
Args:
|
289 |
+
image: numpy rgb image, range [0, 255]
|
290 |
+
"""
|
291 |
+
if isinstance(img_file_path, str):
|
292 |
+
image = np.array(load_image(image=img_file_path).convert("RGB"))
|
293 |
+
else:
|
294 |
+
image = np.array(ImageOps.exif_transpose(Image.fromarray(img_file_path)).convert("RGB"))
|
295 |
+
|
296 |
+
image = resize_numpy_image_long(image, 1024)
|
297 |
+
original_id_image = image
|
298 |
+
|
299 |
+
id_cond, id_vit_hidden, align_crop_face_image, face_kps = process_face_embeddings(face_helper_1, clip_vision_model, face_helper_2, eva_transform_mean, eva_transform_std, app, device, weight_dtype, image, original_id_image, is_align_face)
|
300 |
+
|
301 |
+
tensor = align_crop_face_image.cpu().detach()
|
302 |
+
tensor = tensor.squeeze()
|
303 |
+
tensor = tensor.permute(1, 2, 0)
|
304 |
+
tensor = tensor.numpy() * 255
|
305 |
+
tensor = tensor.astype(np.uint8)
|
306 |
+
image = ImageOps.exif_transpose(Image.fromarray(tensor))
|
307 |
+
|
308 |
+
return id_cond, id_vit_hidden, image, face_kps
|
309 |
+
|
310 |
+
def prepare_face_models(model_path, device, dtype):
|
311 |
+
"""
|
312 |
+
Prepare all face models for the facial recognition task.
|
313 |
+
|
314 |
+
Parameters:
|
315 |
+
- model_path: Path to the directory containing model files.
|
316 |
+
- device: The device (e.g., 'cuda', 'cpu') where models will be loaded.
|
317 |
+
- dtype: Data type (e.g., torch.float32) for model inference.
|
318 |
+
|
319 |
+
Returns:
|
320 |
+
- face_helper_1: First face restoration helper.
|
321 |
+
- face_helper_2: Face recognition model (insightface glintr100) used to extract identity embeddings.
|
322 |
+
- face_clip_model: CLIP model for face extraction.
|
323 |
+
- face_main_model: Main face analysis model.
|
324 |
+
- eva_transform_mean: Mean value for image normalization.
|
325 |
+
- eva_transform_std: Standard deviation value for image normalization.
|
326 |
+
"""
|
327 |
+
# get helper model
|
328 |
+
face_helper_1 = FaceRestoreHelper(
|
329 |
+
upscale_factor=1,
|
330 |
+
face_size=512,
|
331 |
+
crop_ratio=(1, 1),
|
332 |
+
det_model='retinaface_resnet50',
|
333 |
+
save_ext='png',
|
334 |
+
device=device,
|
335 |
+
model_rootpath=os.path.join(model_path, "face_encoder")
|
336 |
+
)
|
337 |
+
face_helper_1.face_parse = None
|
338 |
+
face_helper_1.face_parse = init_parsing_model(model_name='bisenet', device=device, model_rootpath=os.path.join(model_path, "face_encoder"))
|
339 |
+
face_helper_2 = insightface.model_zoo.get_model(f'{model_path}/face_encoder/models/antelopev2/glintr100.onnx', providers=['CUDAExecutionProvider'])
|
340 |
+
face_helper_2.prepare(ctx_id=0)
|
341 |
+
|
342 |
+
# get local facial extractor part 1
|
343 |
+
model, _, _ = create_model_and_transforms('EVA02-CLIP-L-14-336', os.path.join(model_path, "face_encoder", "EVA02_CLIP_L_336_psz14_s6B.pt"), force_custom_clip=True)
|
344 |
+
face_clip_model = model.visual
|
345 |
+
eva_transform_mean = getattr(face_clip_model, 'image_mean', OPENAI_DATASET_MEAN)
|
346 |
+
eva_transform_std = getattr(face_clip_model, 'image_std', OPENAI_DATASET_STD)
|
347 |
+
if not isinstance(eva_transform_mean, (list, tuple)):
|
348 |
+
eva_transform_mean = (eva_transform_mean,) * 3
|
349 |
+
if not isinstance(eva_transform_std, (list, tuple)):
|
350 |
+
eva_transform_std = (eva_transform_std,) * 3
|
351 |
+
eva_transform_mean = eva_transform_mean
|
352 |
+
eva_transform_std = eva_transform_std
|
353 |
+
|
354 |
+
# get local facial extractor part 2
|
355 |
+
face_main_model = FaceAnalysis(name='antelopev2', root=os.path.join(model_path, "face_encoder"), providers=['CUDAExecutionProvider'])
|
356 |
+
face_main_model.prepare(ctx_id=0, det_size=(640, 640))
|
357 |
+
|
358 |
+
# move face models to device
|
359 |
+
face_helper_1.face_det.eval()
|
360 |
+
face_helper_1.face_parse.eval()
|
361 |
+
face_clip_model.eval()
|
362 |
+
|
363 |
+
return face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std
|
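Putting the helpers in this file together, a minimal inference-time sketch might look like the following; the checkpoint directory and the reference-image path are hypothetical placeholders, and the argument order simply follows the signatures defined above.

```python
import torch
from models.utils import prepare_face_models, process_face_embeddings_infer

device = "cuda"
dtype = torch.bfloat16

# Load the face detection / parsing / recognition / CLIP models once.
(face_helper_1, face_helper_2, face_clip_model,
 face_main_model, eva_transform_mean, eva_transform_std) = prepare_face_models("ckpts", device, dtype)

# Extract identity conditioning from a single reference image (hypothetical path).
id_cond, id_vit_hidden, align_crop_face_image, face_kps = process_face_embeddings_infer(
    face_helper_1, face_clip_model, face_helper_2,
    eva_transform_mean, eva_transform_std,
    face_main_model, device, dtype,
    "path/to/reference_face.png",
    is_align_face=True,
)
```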
requirements.txt
CHANGED
@@ -6,7 +6,7 @@ onnx==1.17.0
|
|
6 |
onnxruntime-gpu==1.19.2
|
7 |
deepspeed==0.15.2
|
8 |
accelerate==1.1.1
|
9 |
-
|
10 |
transformers==4.46.3
|
11 |
tokenizers==0.20.1
|
12 |
peft==0.12.0
|
|
|
6 |
onnxruntime-gpu==1.19.2
|
7 |
deepspeed==0.15.2
|
8 |
accelerate==1.1.1
|
9 |
+
git+https://github.com/SHYuanBest/ConsisID_diffusers.git
|
10 |
transformers==4.46.3
|
11 |
tokenizers==0.20.1
|
12 |
peft==0.12.0
|
util/dataloader.py
DELETED
@@ -1,1010 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import gc
|
3 |
-
import cv2
|
4 |
-
import json
|
5 |
-
import math
|
6 |
-
import decord
|
7 |
-
import random
|
8 |
-
import numpy as np
|
9 |
-
from PIL import Image
|
10 |
-
from tqdm import tqdm
|
11 |
-
from decord import VideoReader
|
12 |
-
from contextlib import contextmanager
|
13 |
-
from func_timeout import FunctionTimedOut
|
14 |
-
from typing import Optional, Sized, Iterator
|
15 |
-
|
16 |
-
import torch
|
17 |
-
from torch.utils.data import Dataset, Sampler
|
18 |
-
import torch.nn.functional as F
|
19 |
-
from torchvision.transforms import ToPILImage
|
20 |
-
from torchvision import transforms
|
21 |
-
from accelerate.logging import get_logger
|
22 |
-
|
23 |
-
logger = get_logger(__name__)
|
24 |
-
|
25 |
-
import threading
|
26 |
-
log_lock = threading.Lock()
|
27 |
-
|
28 |
-
def log_error_to_file(error_message, video_path):
|
29 |
-
with log_lock:
|
30 |
-
with open("error_log.txt", "a") as f:
|
31 |
-
f.write(f"Error: {error_message}\n")
|
32 |
-
f.write(f"Video Path: {video_path}\n")
|
33 |
-
f.write("-" * 50 + "\n")
|
34 |
-
|
35 |
-
def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]):
|
36 |
-
stickwidth = 4
|
37 |
-
limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
|
38 |
-
kps = np.array(kps)
|
39 |
-
|
40 |
-
w, h = image_pil.size
|
41 |
-
out_img = np.zeros([h, w, 3])
|
42 |
-
|
43 |
-
for i in range(len(limbSeq)):
|
44 |
-
index = limbSeq[i]
|
45 |
-
color = color_list[index[0]]
|
46 |
-
|
47 |
-
x = kps[index][:, 0]
|
48 |
-
y = kps[index][:, 1]
|
49 |
-
length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
|
50 |
-
angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
|
51 |
-
polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
|
52 |
-
out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
|
53 |
-
out_img = (out_img * 0.6).astype(np.uint8)
|
54 |
-
|
55 |
-
for idx_kp, kp in enumerate(kps):
|
56 |
-
color = color_list[idx_kp]
|
57 |
-
x, y = kp
|
58 |
-
out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
|
59 |
-
|
60 |
-
out_img_pil = Image.fromarray(out_img.astype(np.uint8))
|
61 |
-
return out_img_pil
|
62 |
-
|
63 |
-
@contextmanager
|
64 |
-
def VideoReader_contextmanager(*args, **kwargs):
|
65 |
-
vr = VideoReader(*args, **kwargs)
|
66 |
-
try:
|
67 |
-
yield vr
|
68 |
-
finally:
|
69 |
-
del vr
|
70 |
-
gc.collect()
|
71 |
-
|
72 |
-
def get_valid_segments(valid_frame, tolerance=5):
|
73 |
-
valid_positions = sorted(set(valid_frame['face']).union(set(valid_frame['head'])))
|
74 |
-
|
75 |
-
valid_segments = []
|
76 |
-
current_segment = [valid_positions[0]]
|
77 |
-
|
78 |
-
for i in range(1, len(valid_positions)):
|
79 |
-
if valid_positions[i] - valid_positions[i - 1] <= tolerance:
|
80 |
-
current_segment.append(valid_positions[i])
|
81 |
-
else:
|
82 |
-
valid_segments.append(current_segment)
|
83 |
-
current_segment = [valid_positions[i]]
|
84 |
-
|
85 |
-
if current_segment:
|
86 |
-
valid_segments.append(current_segment)
|
87 |
-
|
88 |
-
return valid_segments
|
89 |
-
|
90 |
-
|
91 |
-
def get_frame_indices_adjusted_for_face(valid_frames, n_frames):
|
92 |
-
valid_length = len(valid_frames)
|
93 |
-
if valid_length >= n_frames:
|
94 |
-
return valid_frames[:n_frames]
|
95 |
-
|
96 |
-
additional_frames_needed = n_frames - valid_length
|
97 |
-
repeat_indices = []
|
98 |
-
|
99 |
-
for i in range(additional_frames_needed):
|
100 |
-
index_to_repeat = i % valid_length
|
101 |
-
repeat_indices.append(valid_frames[index_to_repeat])
|
102 |
-
|
103 |
-
all_indices = valid_frames + repeat_indices
|
104 |
-
all_indices.sort()
|
105 |
-
|
106 |
-
return all_indices
|
107 |
-
|
108 |
-
|
109 |
-
def generate_frame_indices_for_face(n_frames, sample_stride, valid_frame, tolerance=7, skip_frames_start_percent=0.0, skip_frames_end_percent=1.0, skip_frames_start=0, skip_frames_end=0):
|
110 |
-
valid_segments = get_valid_segments(valid_frame, tolerance)
|
111 |
-
selected_segment = max(valid_segments, key=len)
|
112 |
-
|
113 |
-
valid_length = len(selected_segment)
|
114 |
-
if skip_frames_start_percent != 0.0 or skip_frames_end_percent != 1.0:
|
115 |
-
# print("use skip frame percent")
|
116 |
-
valid_start = int(valid_length * skip_frames_start_percent)
|
117 |
-
valid_end = int(valid_length * skip_frames_end_percent)
|
118 |
-
elif skip_frames_start != 0 or skip_frames_end != 0:
|
119 |
-
# print("use skip frame")
|
120 |
-
valid_start = skip_frames_start
|
121 |
-
valid_end = valid_length - skip_frames_end
|
122 |
-
else:
|
123 |
-
# print("no use skip frame")
|
124 |
-
valid_start = 0
|
125 |
-
valid_end = valid_length
|
126 |
-
|
127 |
-
if valid_length <= n_frames:
|
128 |
-
return get_frame_indices_adjusted_for_face(selected_segment, n_frames), valid_length
|
129 |
-
else:
|
130 |
-
adjusted_length = valid_end - valid_start
|
131 |
-
if adjusted_length <= 0:
|
132 |
-
print(f"video_length: {valid_length}, adjusted_length: {adjusted_length}, valid_start:{valid_start}, skip_frames_end: {valid_end}")
|
133 |
-
raise ValueError("Skipping too many frames results in no frames left to sample.")
|
134 |
-
|
135 |
-
clip_length = min(adjusted_length, (n_frames - 1) * sample_stride + 1)
|
136 |
-
start_idx_position = random.randint(valid_start, valid_end - clip_length)
|
137 |
-
start_frame = selected_segment[start_idx_position]
|
138 |
-
|
139 |
-
selected_frames = []
|
140 |
-
for i in range(n_frames):
|
141 |
-
next_frame = start_frame + i * sample_stride
|
142 |
-
if next_frame in selected_segment:
|
143 |
-
selected_frames.append(next_frame)
|
144 |
-
else:
|
145 |
-
break
|
146 |
-
|
147 |
-
if len(selected_frames) < n_frames:
|
148 |
-
return get_frame_indices_adjusted_for_face(selected_frames, n_frames), len(selected_frames)
|
149 |
-
|
150 |
-
return selected_frames, len(selected_frames)
|
151 |
-
|
152 |
-
def frame_has_required_confidence(bbox_data, frame, ID, conf_threshold=0.88):
|
153 |
-
frame_str = str(frame)
|
154 |
-
if frame_str not in bbox_data:
|
155 |
-
return False
|
156 |
-
|
157 |
-
frame_data = bbox_data[frame_str]
|
158 |
-
|
159 |
-
face_conf = any(
|
160 |
-
item['confidence'] > conf_threshold and item['new_track_id'] == ID
|
161 |
-
for item in frame_data.get('face', [])
|
162 |
-
)
|
163 |
-
|
164 |
-
head_conf = any(
|
165 |
-
item['confidence'] > conf_threshold and item['new_track_id'] == ID
|
166 |
-
for item in frame_data.get('head', [])
|
167 |
-
)
|
168 |
-
|
169 |
-
return face_conf and head_conf
|
170 |
-
|
171 |
-
def select_mask_frames_from_index(batch_frame, original_batch_frame, valid_id, corresponding_data, control_sam2_frame,
|
172 |
-
valid_frame, bbox_data, base_dir, min_distance=3, min_frames=1, max_frames=5,
|
173 |
-
mask_type='face', control_mask_type='head', dense_masks=False,
|
174 |
-
ensure_control_frame=True):
|
175 |
-
"""
|
176 |
-
Selects frames with corresponding mask images while ensuring a minimum distance constraint between frames,
|
177 |
-
and that the frames exist in both batch_frame and valid_frame.
|
178 |
-
|
179 |
-
Parameters:
|
180 |
-
base_path (str): Base directory where the JSON files and mask results are located.
|
181 |
-
min_distance (int): Minimum distance between selected frames.
|
182 |
-
min_frames (int): Minimum number of frames to select.
|
183 |
-
max_frames (int): Maximum number of frames to select.
|
184 |
-
mask_type (str): Type of mask to select frames for ('face' or 'head').
|
185 |
-
control_mask_type (str): Type of mask used for control frame selection ('face' or 'head').
|
186 |
-
|
187 |
-
Returns:
|
188 |
-
dict: A dictionary where keys are IDs and values are lists of selected mask PNG paths.
|
189 |
-
"""
|
190 |
-
# Helper function to randomly select frames with at least X frames apart
|
191 |
-
def select_frames_with_distance_constraint(frames, num_frames, min_distance, control_frame, bbox_data, ID,
|
192 |
-
ensure_control_frame=True, fallback=True):
|
193 |
-
"""
|
194 |
-
Selects frames with a minimum distance constraint. If not enough frames can be selected, a fallback plan is applied.
|
195 |
-
|
196 |
-
Parameters:
|
197 |
-
frames (list): List of frame indices to select from.
|
198 |
-
num_frames (int): Number of frames to select.
|
199 |
-
min_distance (int): Minimum distance between selected frames.
|
200 |
-
control_frame (int): The control frame that must always be included.
|
201 |
-
fallback (bool): Whether to apply a fallback strategy if not enough frames meet the distance constraint.
|
202 |
-
|
203 |
-
Returns:
|
204 |
-
list: List of selected frames.
|
205 |
-
"""
|
206 |
-
conf_thresholds = [0.95, 0.94, 0.93, 0.92, 0.91, 0.90]
|
207 |
-
if ensure_control_frame:
|
208 |
-
selected_frames = [control_frame] # Ensure control frame is always included
|
209 |
-
else:
|
210 |
-
valid_initial_frames = []
|
211 |
-
for conf_threshold in conf_thresholds:
|
212 |
-
valid_initial_frames = [
|
213 |
-
f for f in frames
|
214 |
-
if frame_has_required_confidence(bbox_data, f, ID, conf_threshold=conf_threshold)
|
215 |
-
]
|
216 |
-
if valid_initial_frames:
|
217 |
-
break
|
218 |
-
if valid_initial_frames:
|
219 |
-
selected_frames = [random.choice(valid_initial_frames)]
|
220 |
-
else:
|
221 |
-
# If no frame meets the initial confidence, fall back to a random frame (or handle as per your preference)
|
222 |
-
selected_frames = [random.choice(frames)]
|
223 |
-
|
224 |
-
available_frames = [f for f in frames if f != selected_frames[0]] # Exclude control frame for random selection
|
225 |
-
|
226 |
-
random.shuffle(available_frames) # Shuffle to introduce randomness
|
227 |
-
|
228 |
-
while available_frames and len(selected_frames) < num_frames:
|
229 |
-
last_selected_frame = selected_frames[-1]
|
230 |
-
|
231 |
-
valid_choices = []
|
232 |
-
for conf_threshold in conf_thresholds:
|
233 |
-
valid_choices = [
|
234 |
-
f for f in available_frames
|
235 |
-
if abs(f - last_selected_frame) >= min_distance and
|
236 |
-
frame_has_required_confidence(bbox_data, f, ID, conf_threshold=conf_threshold)
|
237 |
-
]
|
238 |
-
if valid_choices:
|
239 |
-
break
|
240 |
-
|
241 |
-
if valid_choices:
|
242 |
-
frame = random.choice(valid_choices)
|
243 |
-
available_frames.remove(frame)
|
244 |
-
selected_frames.append(frame)
|
245 |
-
else:
|
246 |
-
if fallback:
|
247 |
-
# Fallback strategy: uniformly distribute remaining frames if distance constraint cannot be met
|
248 |
-
remaining_needed = num_frames - len(selected_frames)
|
249 |
-
remaining_frames = available_frames[:remaining_needed]
|
250 |
-
|
251 |
-
# Distribute the remaining frames evenly if possible
|
252 |
-
if remaining_frames:
|
253 |
-
step = max(1, len(remaining_frames) // remaining_needed)
|
254 |
-
evenly_selected = remaining_frames[::step][:remaining_needed]
|
255 |
-
selected_frames.extend(evenly_selected)
|
256 |
-
break
|
257 |
-
else:
|
258 |
-
break # No valid choices remain and no fallback strategy is allowed
|
259 |
-
|
260 |
-
if len(selected_frames) < num_frames:
|
261 |
-
return None
|
262 |
-
|
263 |
-
return selected_frames
|
264 |
-
|
265 |
-
# Convert batch_frame list to a set to remove duplicates
|
266 |
-
batch_frame_set = set(batch_frame)
|
267 |
-
|
268 |
-
# Dictionary to store selected mask PNGs
|
269 |
-
selected_masks_dict = {}
|
270 |
-
selected_bboxs_dict = {}
|
271 |
-
dense_masks_dict = {}
|
272 |
-
selected_frames_dict = {}
|
273 |
-
|
274 |
-
# ID
|
275 |
-
try:
|
276 |
-
mask_valid_frames = valid_frame[mask_type] # Select frames based on the specified mask type
|
277 |
-
control_valid_frames = valid_frame[control_mask_type] # Control frames for control_mask_type
|
278 |
-
except KeyError:
|
279 |
-
if mask_type not in valid_frame.keys():
|
280 |
-
print(f"no valid {mask_type}")
|
281 |
-
if control_mask_type not in valid_frame.keys():
|
282 |
-
print(f"no valid {control_mask_type}")
|
283 |
-
|
284 |
-
# Get the control frame for the control mask type
|
285 |
-
control_frame = control_sam2_frame[valid_id][control_mask_type]
|
286 |
-
|
287 |
-
# Filter frames to only those which are in both valid_frame and batch_frame_set
|
288 |
-
valid_frames = []
|
289 |
-
# valid_frames = [frame for frame in mask_valid_frames if frame in control_valid_frames and frame in batch_frame_set]
|
290 |
-
for frame in mask_valid_frames:
|
291 |
-
if frame in control_valid_frames and frame in batch_frame_set:
|
292 |
-
# Check if bbox_data has 'head' or 'face' for the frame
|
293 |
-
if str(frame) in bbox_data:
|
294 |
-
frame_data = bbox_data[str(frame)]
|
295 |
-
if 'head' in frame_data or 'face' in frame_data:
|
296 |
-
valid_frames.append(frame)
|
297 |
-
|
298 |
-
# Ensure the control frame is included in the valid frames
|
299 |
-
if ensure_control_frame and (control_frame not in valid_frames):
|
300 |
-
valid_frames.append(control_frame)
|
301 |
-
|
302 |
-
# Select a random number of frames between min_frames and max_frames
|
303 |
-
num_frames_to_select = random.randint(min_frames, max_frames)
|
304 |
-
selected_frames = select_frames_with_distance_constraint(valid_frames, num_frames_to_select, min_distance,
|
305 |
-
control_frame, bbox_data, valid_id, ensure_control_frame)
|
306 |
-
|
307 |
-
# Store the selected frames as mask PNGs and bbox
|
308 |
-
selected_masks_dict[valid_id] = []
|
309 |
-
selected_bboxs_dict[valid_id] = []
|
310 |
-
|
311 |
-
# Initialize the dense_masks_dict entry for the current ID
|
312 |
-
dense_masks_dict[valid_id] = []
|
313 |
-
|
314 |
-
# Store the selected frames in the dictionary
|
315 |
-
selected_frames_dict[valid_id] = selected_frames
|
316 |
-
|
317 |
-
if dense_masks:
|
318 |
-
for frame in original_batch_frame:
|
319 |
-
mask_data_path = f"{base_dir}/{valid_id}/annotated_frame_{int(frame):05d}.png"
|
320 |
-
mask_array = np.array(Image.open(mask_data_path))
|
321 |
-
binary_mask = np.where(mask_array > 0, 1, 0).astype(np.uint8)
|
322 |
-
dense_masks_dict[valid_id].append(binary_mask)
|
323 |
-
|
324 |
-
for frame in selected_frames:
|
325 |
-
mask_data_path = f"{base_dir}/{valid_id}/annotated_frame_{frame:05d}.png"
|
326 |
-
mask_array = np.array(Image.open(mask_data_path))
|
327 |
-
binary_mask = np.where(mask_array > 0, 1, 0).astype(np.uint8)
|
328 |
-
selected_masks_dict[valid_id].append(binary_mask)
|
329 |
-
|
330 |
-
try:
|
331 |
-
for item in bbox_data[f"{frame}"]["head"]:
|
332 |
-
if item['new_track_id'] == int(valid_id):
|
333 |
-
temp_bbox = item['box']
|
334 |
-
break
|
335 |
-
except (KeyError, StopIteration):
|
336 |
-
try:
|
337 |
-
for item in bbox_data[f"{frame}"]["face"]:
|
338 |
-
if item['new_track_id'] == int(valid_id):
|
339 |
-
temp_bbox = item['box']
|
340 |
-
break
|
341 |
-
except (KeyError, StopIteration):
|
342 |
-
temp_bbox = None
|
343 |
-
|
344 |
-
selected_bboxs_dict[valid_id].append(temp_bbox)
|
345 |
-
|
346 |
-
return selected_frames_dict, selected_masks_dict, selected_bboxs_dict, dense_masks_dict
|
347 |
-
|
348 |
-
def pad_tensor(tensor, target_size, dim=0):
|
349 |
-
padding_size = target_size - tensor.size(dim)
|
350 |
-
if padding_size > 0:
|
351 |
-
pad_shape = list(tensor.shape)
|
352 |
-
pad_shape[dim] = padding_size
|
353 |
-
padding_tensor = torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)
|
354 |
-
return torch.cat([tensor, padding_tensor], dim=dim)
|
355 |
-
else:
|
356 |
-
return tensor[:target_size]
|
357 |
-
|
358 |
-
def crop_images(selected_frame_index, selected_bboxs_dict, video_reader, return_ori=False):
|
359 |
-
"""
|
360 |
-
Crop images based on given bounding boxes and frame indices from a video.
|
361 |
-
|
362 |
-
Args:
|
363 |
-
selected_frame_index (list): List of frame indices to be cropped.
|
364 |
-
selected_bboxs_dict (list of dict): List of dictionaries, each containing 'x1', 'y1', 'x2', 'y2' bounding box coordinates.
|
365 |
-
video_reader (VideoReader or list of numpy arrays): Video frames accessible by index, where each frame is a numpy array (H, W, C).
|
366 |
-
|
367 |
-
Returns:
|
368 |
-
list: A list of cropped images in PIL Image format.
|
369 |
-
"""
|
370 |
-
expanded_cropped_images = []
|
371 |
-
original_cropped_images = []
|
372 |
-
for frame_idx, bbox in zip(selected_frame_index, selected_bboxs_dict):
|
373 |
-
# Get the specific frame from the video reader using the frame index
|
374 |
-
frame = video_reader[frame_idx] # torch.tensor # (H, W, C)
|
375 |
-
|
376 |
-
# Extract bounding box coordinates and convert them to integers
|
377 |
-
x1, y1, x2, y2 = int(bbox['x1']), int(bbox['y1']), int(bbox['x2']), int(bbox['y2'])
|
378 |
-
# Crop to minimize the bounding box to a square
|
379 |
-
width = x2 - x1 # Calculate the width of the bounding box
|
380 |
-
height = y2 - y1 # Calculate the height of the bounding box
|
381 |
-
side_length = max(width, height) # Determine the side length of the square (max of width or height)
|
382 |
-
|
383 |
-
# Calculate the center of the bounding box
|
384 |
-
center_x = (x1 + x2) // 2
|
385 |
-
center_y = (y1 + y2) // 2
|
386 |
-
|
387 |
-
# Calculate new coordinates for the square region centered around the original bounding box
|
388 |
-
new_x1 = max(0, center_x - side_length // 2) # Ensure x1 is within image bounds
|
389 |
-
new_y1 = max(0, center_y - side_length // 2) # Ensure y1 is within image bounds
|
390 |
-
new_x2 = min(frame.shape[1], new_x1 + side_length) # Ensure x2 does not exceed image width
|
391 |
-
new_y2 = min(frame.shape[0], new_y1 + side_length) # Ensure y2 does not exceed image height
|
392 |
-
|
393 |
-
# Adjust coordinates if the cropped area is smaller than the desired side length
|
394 |
-
# Ensure final width and height are equal, keeping it a square
|
395 |
-
actual_width = new_x2 - new_x1
|
396 |
-
actual_height = new_y2 - new_y1
|
397 |
-
|
398 |
-
if actual_width < side_length:
|
399 |
-
# Adjust x1 or x2 to ensure the correct side length, while staying in bounds
|
400 |
-
if new_x1 == 0:
|
401 |
-
new_x2 = min(frame.shape[1], new_x1 + side_length)
|
402 |
-
else:
|
403 |
-
new_x1 = max(0, new_x2 - side_length)
|
404 |
-
|
405 |
-
if actual_height < side_length:
|
406 |
-
# Adjust y1 or y2 to ensure the correct side length, while staying in bounds
|
407 |
-
if new_y1 == 0:
|
408 |
-
new_y2 = min(frame.shape[0], new_y1 + side_length)
|
409 |
-
else:
|
410 |
-
new_y1 = max(0, new_y2 - side_length)
|
411 |
-
|
412 |
-
# Expand the square by 20%
|
413 |
-
expansion_ratio = 0.2 # Define the expansion ratio
|
414 |
-
expansion_amount = int(side_length * expansion_ratio) # Calculate the number of pixels to expand by
|
415 |
-
|
416 |
-
# Calculate expanded coordinates, ensuring they stay within image bounds
|
417 |
-
expanded_x1 = max(0, new_x1 - expansion_amount) # Expand left, ensuring x1 is within bounds
|
418 |
-
expanded_y1 = max(0, new_y1 - expansion_amount) # Expand up, ensuring y1 is within bounds
|
419 |
-
expanded_x2 = min(frame.shape[1], new_x2 + expansion_amount) # Expand right, ensuring x2 does not exceed bounds
|
420 |
-
expanded_y2 = min(frame.shape[0], new_y2 + expansion_amount) # Expand down, ensuring y2 does not exceed bounds
|
421 |
-
|
422 |
-
# Ensure the expanded area is still a square
|
423 |
-
expanded_width = expanded_x2 - expanded_x1
|
424 |
-
expanded_height = expanded_y2 - expanded_y1
|
425 |
-
final_side_length = min(expanded_width, expanded_height)
|
426 |
-
|
427 |
-
# Adjust to ensure square shape if necessary
|
428 |
-
if expanded_width != expanded_height:
|
429 |
-
if expanded_width > expanded_height:
|
430 |
-
expanded_x2 = expanded_x1 + final_side_length
|
431 |
-
else:
|
432 |
-
expanded_y2 = expanded_y1 + final_side_length
|
433 |
-
|
434 |
-
expanded_cropped_rgb_tensor = frame[expanded_y1:expanded_y2, expanded_x1:expanded_x2, :]
|
435 |
-
expanded_cropped_rgb = Image.fromarray(np.array(expanded_cropped_rgb_tensor)).convert('RGB')
|
436 |
-
expanded_cropped_images.append(expanded_cropped_rgb)
|
437 |
-
|
438 |
-
if return_ori:
|
439 |
-
original_cropped_rgb_tensor = frame[new_y1:new_y2, new_x1:new_x2, :]
|
440 |
-
original_cropped_rgb = Image.fromarray(np.array(original_cropped_rgb_tensor)).convert('RGB')
|
441 |
-
original_cropped_images.append(original_cropped_rgb)
|
442 |
-
return expanded_cropped_images, original_cropped_images
|
443 |
-
|
444 |
-
return expanded_cropped_images, None
|
445 |
-
|
446 |
-
def process_cropped_images(expand_images_pil, original_images_pil, target_size=(480, 480)):
|
447 |
-
"""
|
448 |
-
Process a list of cropped images in PIL format.
|
449 |
-
|
450 |
-
Parameters:
|
451 |
-
expand_images_pil (list of PIL.Image): List of cropped images in PIL format.
|
452 |
-
target_size (tuple of int): The target size for resizing images, default is (480, 480).
|
453 |
-
|
454 |
-
Returns:
|
455 |
-
torch.Tensor: A tensor containing the processed images.
|
456 |
-
"""
|
457 |
-
expand_face_imgs = []
|
458 |
-
original_face_imgs = []
|
459 |
-
if len(original_images_pil) != 0:
|
460 |
-
for expand_img, original_img in zip(expand_images_pil, original_images_pil):
|
461 |
-
expand_resized_img = expand_img.resize(target_size, Image.LANCZOS)
|
462 |
-
expand_src_img = np.array(expand_resized_img)
|
463 |
-
expand_src_img = np.transpose(expand_src_img, (2, 0, 1))
|
464 |
-
expand_src_img = torch.from_numpy(expand_src_img).unsqueeze(0).float()
|
465 |
-
expand_face_imgs.append(expand_src_img)
|
466 |
-
|
467 |
-
original_resized_img = original_img.resize(target_size, Image.LANCZOS)
|
468 |
-
original_src_img = np.array(original_resized_img)
|
469 |
-
original_src_img = np.transpose(original_src_img, (2, 0, 1))
|
470 |
-
original_src_img = torch.from_numpy(original_src_img).unsqueeze(0).float()
|
471 |
-
original_face_imgs.append(original_src_img)
|
472 |
-
|
473 |
-
expand_face_imgs = torch.cat(expand_face_imgs, dim=0)
|
474 |
-
original_face_imgs = torch.cat(original_face_imgs, dim=0)
|
475 |
-
else:
|
476 |
-
for expand_img in expand_images_pil:
|
477 |
-
expand_resized_img = expand_img.resize(target_size, Image.LANCZOS)
|
478 |
-
expand_src_img = np.array(expand_resized_img)
|
479 |
-
expand_src_img = np.transpose(expand_src_img, (2, 0, 1))
|
480 |
-
expand_src_img = torch.from_numpy(expand_src_img).unsqueeze(0).float()
|
481 |
-
expand_face_imgs.append(expand_src_img)
|
482 |
-
expand_face_imgs = torch.cat(expand_face_imgs, dim=0)
|
483 |
-
original_face_imgs = None
|
484 |
-
|
485 |
-
return expand_face_imgs, original_face_imgs
|
486 |
-
|
487 |
-
class RandomSampler(Sampler[int]):
|
488 |
-
r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
|
489 |
-
|
490 |
-
If with replacement, then user can specify :attr:`num_samples` to draw.
|
491 |
-
|
492 |
-
Args:
|
493 |
-
data_source (Dataset): dataset to sample from
|
494 |
-
replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
|
495 |
-
num_samples (int): number of samples to draw, default=`len(dataset)`.
|
496 |
-
generator (Generator): Generator used in sampling.
|
497 |
-
"""
|
498 |
-
|
499 |
-
data_source: Sized
|
500 |
-
replacement: bool
|
501 |
-
|
502 |
-
def __init__(self, data_source: Sized, replacement: bool = False,
|
503 |
-
num_samples: Optional[int] = None, generator=None) -> None:
|
504 |
-
self.data_source = data_source
|
505 |
-
self.replacement = replacement
|
506 |
-
self._num_samples = num_samples
|
507 |
-
self.generator = generator
|
508 |
-
self._pos_start = 0
|
509 |
-
|
510 |
-
if not isinstance(self.replacement, bool):
|
511 |
-
raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}")
|
512 |
-
|
513 |
-
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
|
514 |
-
raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}")
|
515 |
-
|
516 |
-
@property
|
517 |
-
def num_samples(self) -> int:
|
518 |
-
# dataset size might change at runtime
|
519 |
-
if self._num_samples is None:
|
520 |
-
return len(self.data_source)
|
521 |
-
return self._num_samples
|
522 |
-
|
523 |
-
def __iter__(self) -> Iterator[int]:
|
524 |
-
n = len(self.data_source)
|
525 |
-
if self.generator is None:
|
526 |
-
seed = int(torch.empty((), dtype=torch.int64).random_().item())
|
527 |
-
generator = torch.Generator()
|
528 |
-
generator.manual_seed(seed)
|
529 |
-
else:
|
530 |
-
generator = self.generator
|
531 |
-
|
532 |
-
if self.replacement:
|
533 |
-
for _ in range(self.num_samples // 32):
|
534 |
-
yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist()
|
535 |
-
yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist()
|
536 |
-
else:
|
537 |
-
for _ in range(self.num_samples // n):
|
538 |
-
xx = torch.randperm(n, generator=generator).tolist()
|
539 |
-
if self._pos_start >= n:
|
540 |
-
self._pos_start = 0
|
541 |
-
print("xx top 10", xx[:10], self._pos_start)
|
542 |
-
for idx in range(self._pos_start, n):
|
543 |
-
yield xx[idx]
|
544 |
-
self._pos_start = (self._pos_start + 1) % n
|
545 |
-
self._pos_start = 0
|
546 |
-
yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
|
547 |
-
|
548 |
-
def __len__(self) -> int:
|
549 |
-
return self.num_samples
|
550 |
-
|
551 |
-
class SequentialSampler(Sampler[int]):
|
552 |
-
r"""Samples elements sequentially, always in the same order.
|
553 |
-
|
554 |
-
Args:
|
555 |
-
data_source (Dataset): dataset to sample from
|
556 |
-
"""
|
557 |
-
|
558 |
-
data_source: Sized
|
559 |
-
|
560 |
-
def __init__(self, data_source: Sized) -> None:
|
561 |
-
self.data_source = data_source
|
562 |
-
self._pos_start = 0
|
563 |
-
|
564 |
-
def __iter__(self) -> Iterator[int]:
|
565 |
-
n = len(self.data_source)
|
566 |
-
for idx in range(self._pos_start, n):
|
567 |
-
yield idx
|
568 |
-
self._pos_start = (self._pos_start + 1) % n
|
569 |
-
self._pos_start = 0
|
570 |
-
|
571 |
-
def __len__(self) -> int:
|
572 |
-
return len(self.data_source)
|
573 |
-
|
574 |
-
class ConsisID_Dataset(Dataset):
|
575 |
-
def __init__(
|
576 |
-
self,
|
577 |
-
instance_data_root: Optional[str] = None,
|
578 |
-
id_token: Optional[str] = None,
|
579 |
-
height=480,
|
580 |
-
width=640,
|
581 |
-
max_num_frames=49,
|
582 |
-
sample_stride=3,
|
583 |
-
skip_frames_start_percent=0.0,
|
584 |
-
skip_frames_end_percent=1.0,
|
585 |
-
skip_frames_start=0,
|
586 |
-
skip_frames_end=0,
|
587 |
-
text_drop_ratio=-1,
|
588 |
-
is_train_face=False,
|
589 |
-
is_single_face=False,
|
590 |
-
miss_tolerance=6,
|
591 |
-
min_distance=3,
|
592 |
-
min_frames=1,
|
593 |
-
max_frames=5,
|
594 |
-
is_cross_face=False,
|
595 |
-
is_reserve_face=False,
|
596 |
-
):
|
597 |
-
self.id_token = id_token or ""
|
598 |
-
|
599 |
-
# ConsisID
|
600 |
-
self.skip_frames_start_percent = skip_frames_start_percent
|
601 |
-
self.skip_frames_end_percent = skip_frames_end_percent
|
602 |
-
self.skip_frames_start = skip_frames_start
|
603 |
-
self.skip_frames_end = skip_frames_end
|
604 |
-
self.is_train_face = is_train_face
|
605 |
-
self.is_single_face = is_single_face
|
606 |
-
|
607 |
-
if is_train_face:
|
608 |
-
self.miss_tolerance = miss_tolerance
|
609 |
-
self.min_distance = min_distance
|
610 |
-
self.min_frames = min_frames
|
611 |
-
self.max_frames = max_frames
|
612 |
-
self.is_cross_face = is_cross_face
|
613 |
-
self.is_reserve_face = is_reserve_face
|
614 |
-
|
615 |
-
# Loading annotations from files
|
616 |
-
print(f"loading annotations from {instance_data_root} ...")
|
617 |
-
with open(instance_data_root, 'r') as f:
|
618 |
-
folder_anno = [i.strip().split(',') for i in f.readlines() if len(i.strip()) > 0]
|
619 |
-
|
620 |
-
self.instance_prompts = []
|
621 |
-
self.instance_video_paths = []
|
622 |
-
self.instance_annotation_base_paths = []
|
623 |
-
for sub_root, anno, anno_base in tqdm(folder_anno):
|
624 |
-
print(anno)
|
625 |
-
self.instance_annotation_base_paths.append(anno_base)
|
626 |
-
with open(anno, 'r') as f:
|
627 |
-
sub_list = json.load(f)
|
628 |
-
for i in tqdm(sub_list):
|
629 |
-
path = os.path.join(sub_root, os.path.basename(i['path']))
|
630 |
-
cap = i.get('cap', None)
|
631 |
-
fps = i.get('fps', 0)
|
632 |
-
duration = i.get('duration', 0)
|
633 |
-
|
634 |
-
if fps * duration < 49.0:
|
635 |
-
continue
|
636 |
-
|
637 |
-
self.instance_prompts.append(cap)
|
638 |
-
self.instance_video_paths.append(path)
|
639 |
-
|
640 |
-
self.num_instance_videos = len(self.instance_video_paths)
|
641 |
-
|
642 |
-
self.text_drop_ratio = text_drop_ratio
|
643 |
-
|
644 |
-
# Video params
|
645 |
-
self.sample_stride = sample_stride
|
646 |
-
self.max_num_frames = max_num_frames
|
647 |
-
self.height = height
|
648 |
-
self.width = width
|
649 |
-
|
650 |
-
def _get_frame_indices_adjusted(self, video_length, n_frames):
|
651 |
-
indices = list(range(video_length))
|
652 |
-
additional_frames_needed = n_frames - video_length
|
653 |
-
|
654 |
-
repeat_indices = []
|
655 |
-
for i in range(additional_frames_needed):
|
656 |
-
index_to_repeat = i % video_length
|
657 |
-
repeat_indices.append(indices[index_to_repeat])
|
658 |
-
|
659 |
-
all_indices = indices + repeat_indices
|
660 |
-
all_indices.sort()
|
661 |
-
|
662 |
-
return all_indices
|
663 |
-
|
664 |
-
|
665 |
-
def _generate_frame_indices(self, video_length, n_frames, sample_stride, skip_frames_start_percent=0.0, skip_frames_end_percent=1.0, skip_frames_start=0, skip_frames_end=0):
|
666 |
-
if skip_frames_start_percent != 0.0 or skip_frames_end_percent != 1.0:
|
667 |
-
print("use skip frame percent")
|
668 |
-
valid_start = int(video_length * skip_frames_start_percent)
|
669 |
-
valid_end = int(video_length * skip_frames_end_percent)
|
670 |
-
elif skip_frames_start != 0 or skip_frames_end != 0:
|
671 |
-
print("use skip frame")
|
672 |
-
valid_start = skip_frames_start
|
673 |
-
valid_end = video_length - skip_frames_end
|
674 |
-
else:
|
675 |
-
print("no use skip frame")
|
676 |
-
valid_start = 0
|
677 |
-
valid_end = video_length
|
678 |
-
|
679 |
-
adjusted_length = valid_end - valid_start
|
680 |
-
|
681 |
-
if adjusted_length <= 0:
|
682 |
-
print(f"video_length: {video_length}, adjusted_length: {adjusted_length}, valid_start:{valid_start}, skip_frames_end: {valid_end}")
|
683 |
-
raise ValueError("Skipping too many frames results in no frames left to sample.")
|
684 |
-
|
685 |
-
if video_length <= n_frames:
|
686 |
-
return self._get_frame_indices_adjusted(video_length, n_frames)
|
687 |
-
else:
|
688 |
-
# clip_length = min(video_length, (n_frames - 1) * sample_stride + 1)
|
689 |
-
# start_idx = random.randint(0, video_length - clip_length)
|
690 |
-
# frame_indices = np.linspace(start_idx, start_idx + clip_length - 1, n_frames, dtype=int).tolist()
|
691 |
-
|
692 |
-
clip_length = min(adjusted_length, (n_frames - 1) * sample_stride + 1)
|
693 |
-
start_idx = random.randint(valid_start, valid_end - clip_length)
|
694 |
-
frame_indices = np.linspace(start_idx, start_idx + clip_length - 1, n_frames, dtype=int).tolist()
|
695 |
-
return frame_indices
|
696 |
-
|
697 |
-
def _short_resize_and_crop(self, frames, target_width, target_height):
|
698 |
-
"""
|
699 |
-
Resize frames and crop to the specified size.
|
700 |
-
|
701 |
-
Args:
|
702 |
-
frames (torch.Tensor): Input frames of shape [T, H, W, C].
|
703 |
-
target_width (int): Desired width.
|
704 |
-
target_height (int): Desired height.
|
705 |
-
|
706 |
-
Returns:
|
707 |
-
torch.Tensor: Cropped frames of shape [T, target_height, target_width, C].
|
708 |
-
"""
|
709 |
-
T, H, W, C = frames.shape
|
710 |
-
aspect_ratio = W / H
|
711 |
-
|
712 |
-
# Determine new dimensions ensuring they are at least target size
|
713 |
-
if aspect_ratio > target_width / target_height:
|
714 |
-
new_width = target_width
|
715 |
-
new_height = int(target_width / aspect_ratio)
|
716 |
-
if new_height < target_height:
|
717 |
-
new_height = target_height
|
718 |
-
new_width = int(target_height * aspect_ratio)
|
719 |
-
else:
|
720 |
-
new_height = target_height
|
721 |
-
new_width = int(target_height * aspect_ratio)
|
722 |
-
if new_width < target_width:
|
723 |
-
new_width = target_width
|
724 |
-
new_height = int(target_width / aspect_ratio)
|
725 |
-
|
726 |
-
resize_transform = transforms.Resize((new_height, new_width))
|
727 |
-
crop_transform = transforms.CenterCrop((target_height, target_width))
|
728 |
-
|
729 |
-
frames_tensor = frames.permute(0, 3, 1, 2) # (T, H, W, C) -> (T, C, H, W)
|
730 |
-
resized_frames = resize_transform(frames_tensor)
|
731 |
-
cropped_frames = crop_transform(resized_frames)
|
732 |
-
sample = cropped_frames.permute(0, 2, 3, 1)
|
733 |
-
|
734 |
-
return sample
|
735 |
-
|
736 |
-
def _resize_with_aspect_ratio(self, frames, target_width, target_height):
|
737 |
-
"""
|
738 |
-
Resize frames while maintaining the aspect ratio by padding or cropping.
|
739 |
-
|
740 |
-
Args:
|
741 |
-
frames (torch.Tensor): Input frames of shape [T, H, W, C].
|
742 |
-
target_width (int): Desired width.
|
743 |
-
target_height (int): Desired height.
|
744 |
-
|
745 |
-
Returns:
|
746 |
-
torch.Tensor: Resized and padded frames of shape [T, target_height, target_width, C].
|
747 |
-
"""
|
748 |
-
T, frame_height, frame_width, C = frames.shape
|
749 |
-
aspect_ratio = frame_width / frame_height # 1.77, 1280 720 -> 720 406
|
750 |
-
target_aspect_ratio = target_width / target_height # 1.50, 720 480 ->
|
751 |
-
|
752 |
-
# If the frame is wider than the target, resize based on width
|
753 |
-
if aspect_ratio > target_aspect_ratio:
|
754 |
-
new_width = target_width
|
755 |
-
new_height = int(target_width / aspect_ratio)
|
756 |
-
else:
|
757 |
-
new_height = target_height
|
758 |
-
new_width = int(target_height * aspect_ratio)
|
759 |
-
|
760 |
-
# Resize using batch processing
|
761 |
-
frames = frames.permute(0, 3, 1, 2) # [T, C, H, W]
|
762 |
-
frames = F.interpolate(frames, size=(new_height, new_width), mode='bilinear', align_corners=False)
|
763 |
-
|
764 |
-
# Calculate padding
|
765 |
-
pad_top = (target_height - new_height) // 2
|
766 |
-
pad_bottom = target_height - new_height - pad_top
|
767 |
-
pad_left = (target_width - new_width) // 2
|
768 |
-
pad_right = target_width - new_width - pad_left
|
769 |
-
|
770 |
-
# Apply padding
|
771 |
-
frames = F.pad(frames, (pad_left, pad_right, pad_top, pad_bottom), mode='constant', value=0)
|
772 |
-
|
773 |
-
frames = frames.permute(0, 2, 3, 1) # [T, H, W, C]
|
774 |
-
|
775 |
-
return frames
|
776 |
-
|
777 |
-
|
778 |
-
def _save_frame(self, frame, name="1.png"):
|
779 |
-
# [H, W, C] -> [C, H, W]
|
780 |
-
img = frame
|
781 |
-
img = img.permute(2, 0, 1)
|
782 |
-
to_pil = ToPILImage()
|
783 |
-
img = to_pil(img)
|
784 |
-
img.save(name)
|
785 |
-
|
786 |
-
|
787 |
-
def _save_video(self, torch_frames, name="output.mp4"):
|
788 |
-
from moviepy.editor import ImageSequenceClip
|
789 |
-
frames_np = torch_frames.cpu().numpy()
|
790 |
-
if frames_np.dtype != 'uint8':
|
791 |
-
frames_np = frames_np.astype('uint8')
|
792 |
-
frames_list = [frame for frame in frames_np]
|
793 |
-
desired_fps = 24
|
794 |
-
clip = ImageSequenceClip(frames_list, fps=desired_fps)
|
795 |
-
clip.write_videofile(name, codec="libx264")
|
796 |
-
|
797 |
-
|
798 |
-
def get_batch(self, idx):
|
799 |
-
decord.bridge.set_bridge("torch")
|
800 |
-
|
801 |
-
video_dir = self.instance_video_paths[idx]
|
802 |
-
text = self.instance_prompts[idx]
|
803 |
-
|
804 |
-
train_transforms = transforms.Compose(
|
805 |
-
[
|
806 |
-
transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
|
807 |
-
]
|
808 |
-
)
|
809 |
-
|
810 |
-
with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
|
811 |
-
video_num_frames = len(video_reader)
|
812 |
-
|
813 |
-
if self.is_train_face:
|
814 |
-
reserve_face_imgs = None
|
815 |
-
file_base_name = os.path.basename(video_dir.replace(".mp4", ""))
|
816 |
-
|
817 |
-
anno_base_path = self.instance_annotation_base_paths[idx]
|
818 |
-
valid_frame_path = os.path.join(anno_base_path, "track_masks_data", file_base_name, "valid_frame.json")
|
819 |
-
control_sam2_frame_path = os.path.join(anno_base_path, "track_masks_data", file_base_name, "control_sam2_frame.json")
|
820 |
-
corresponding_data_path = os.path.join(anno_base_path, "track_masks_data", file_base_name, "corresponding_data.json")
|
821 |
-
masks_data_path = os.path.join(anno_base_path, "track_masks_data", file_base_name, "tracking_mask_results")
|
822 |
-
bboxs_data_path = os.path.join(anno_base_path, "refine_bbox_jsons", f"{file_base_name}.json")
|
823 |
-
|
824 |
-
with open(corresponding_data_path, 'r') as f:
|
825 |
-
corresponding_data = json.load(f)
|
826 |
-
|
827 |
-
with open(control_sam2_frame_path, 'r') as f:
|
828 |
-
control_sam2_frame = json.load(f)
|
829 |
-
|
830 |
-
with open(valid_frame_path, 'r') as f:
|
831 |
-
valid_frame = json.load(f)
|
832 |
-
|
833 |
-
with open(bboxs_data_path, 'r') as f:
|
834 |
-
bbox_data = json.load(f)
|
835 |
-
|
836 |
-
if self.is_single_face:
|
837 |
-
if len(corresponding_data) != 1:
|
838 |
-
raise ValueError(f"Using single face, but {idx} is multi person.")
|
839 |
-
|
840 |
-
# get random valid id
|
841 |
-
valid_ids = []
|
842 |
-
backup_ids = []
|
843 |
-
for id_key, data in corresponding_data.items():
|
844 |
-
if 'face' in data and 'head' in data:
|
845 |
-
valid_ids.append(id_key)
|
846 |
-
|
847 |
-
valid_id = random.choice(valid_ids) if valid_ids else (random.choice(backup_ids) if backup_ids else None)
|
848 |
-
if valid_id is None:
|
849 |
-
raise ValueError("No valid ID found: both valid_ids and backup_ids are empty.")
|
850 |
-
|
851 |
-
# get video
|
852 |
-
total_index = list(range(video_num_frames))
|
853 |
-
batch_index, _ = generate_frame_indices_for_face(self.max_num_frames, self.sample_stride, valid_frame[valid_id],
|
854 |
-
self.miss_tolerance, self.skip_frames_start_percent, self.skip_frames_end_percent,
|
855 |
-
self.skip_frames_start, self.skip_frames_end)
|
856 |
-
|
857 |
-
if self.is_cross_face:
|
858 |
-
remaining_batch_index_index = [i for i in total_index if i not in batch_index]
|
859 |
-
try:
|
860 |
-
selected_frame_index, selected_masks_dict, selected_bboxs_dict, dense_masks_dict = select_mask_frames_from_index(
|
861 |
-
remaining_batch_index_index,
|
862 |
-
batch_index, valid_id,
|
863 |
-
corresponding_data, control_sam2_frame,
|
864 |
-
valid_frame[valid_id], bbox_data, masks_data_path,
|
865 |
-
min_distance=self.min_distance, min_frames=self.min_frames,
|
866 |
-
max_frames=self.max_frames, dense_masks=True,
|
867 |
-
ensure_control_frame=False,
|
868 |
-
)
|
869 |
-
except:
|
870 |
-
selected_frame_index, selected_masks_dict, selected_bboxs_dict, dense_masks_dict = select_mask_frames_from_index(
|
871 |
-
batch_index,
|
872 |
-
batch_index, valid_id,
|
873 |
-
corresponding_data, control_sam2_frame,
|
874 |
-
valid_frame[valid_id], bbox_data, masks_data_path,
|
875 |
-
min_distance=self.min_distance, min_frames=self.min_frames,
|
876 |
-
max_frames=self.max_frames, dense_masks=True,
|
877 |
-
ensure_control_frame=False,
|
878 |
-                    )
-            else:
-                selected_frame_index, selected_masks_dict, selected_bboxs_dict, dense_masks_dict = select_mask_frames_from_index(
-                    batch_index,
-                    batch_index, valid_id,
-                    corresponding_data, control_sam2_frame,
-                    valid_frame[valid_id], bbox_data, masks_data_path,
-                    min_distance=self.min_distance, min_frames=self.min_frames,
-                    max_frames=self.max_frames, dense_masks=True,
-                    ensure_control_frame=True,
-                )
-                if self.is_reserve_face:
-                    reserve_frame_index, _, reserve_bboxs_dict, _ = select_mask_frames_from_index(
-                        batch_index,
-                        batch_index, valid_id,
-                        corresponding_data, control_sam2_frame,
-                        valid_frame[valid_id], bbox_data, masks_data_path,
-                        min_distance=3, min_frames=4,
-                        max_frames=4, dense_masks=False,
-                        ensure_control_frame=False,
-                    )
-
-            # get mask and aligned_face_img
-            selected_frame_index = selected_frame_index[valid_id]
-            valid_frame = valid_frame[valid_id]
-            selected_masks_dict = selected_masks_dict[valid_id]
-            selected_bboxs_dict = selected_bboxs_dict[valid_id]
-            dense_masks_dict = dense_masks_dict[valid_id]
-
-            if self.is_reserve_face:
-                reserve_frame_index = reserve_frame_index[valid_id]
-                reserve_bboxs_dict = reserve_bboxs_dict[valid_id]
-
-            selected_masks_tensor = torch.stack([torch.tensor(mask) for mask in selected_masks_dict])
-            temp_dense_masks_tensor = torch.stack([torch.tensor(mask) for mask in dense_masks_dict])
-            dense_masks_tensor = self._short_resize_and_crop(temp_dense_masks_tensor.unsqueeze(-1), self.width, self.height).squeeze(-1)  # [T, H, W] -> [T, H, W, 1] -> [T, H, W]
-
-            expand_images_pil, original_images_pil = crop_images(selected_frame_index, selected_bboxs_dict, video_reader, return_ori=True)
-            expand_face_imgs, original_face_imgs = process_cropped_images(expand_images_pil, original_images_pil, target_size=(480, 480))
-            if self.is_reserve_face:
-                reserve_images_pil, _ = crop_images(reserve_frame_index, reserve_bboxs_dict, video_reader, return_ori=False)
-                reserve_face_imgs, _ = process_cropped_images(reserve_images_pil, [], target_size=(480, 480))
-
-            if len(expand_face_imgs) == 0 or len(original_face_imgs) == 0:
-                raise ValueError("No face detected in input image pool")
-
-            # post process id related data
-            expand_face_imgs = pad_tensor(expand_face_imgs, self.max_frames, dim=0)
-            original_face_imgs = pad_tensor(original_face_imgs, self.max_frames, dim=0)
-            selected_frame_index = torch.tensor(selected_frame_index)  # torch.Size([15, 13]) [N1]
-            selected_frame_index = pad_tensor(selected_frame_index, self.max_frames, dim=0)
-        else:
-            batch_index = self._generate_frame_indices(video_num_frames, self.max_num_frames, self.sample_stride,
-                                                       self.skip_frames_start_percent, self.skip_frames_end_percent,
-                                                       self.skip_frames_start, self.skip_frames_end)
-
-        try:
-            frames = video_reader.get_batch(batch_index)  # torch [T, H, W, C]
-            frames = self._short_resize_and_crop(frames, self.width, self.height)  # [T, H, W, C]
-        except FunctionTimedOut:
-            raise ValueError(f"Read {idx} timeout.")
-        except Exception as e:
-            raise ValueError(f"Failed to extract frames from video. Error is {e}.")
-
-        # Apply training transforms in batch
-        frames = frames.float()
-        frames = train_transforms(frames)
-        pixel_values = frames.permute(0, 3, 1, 2).contiguous()  # [T, C, H, W]
-        del video_reader
-
-        # Randomly drop the text prompt
-        if random.random() < self.text_drop_ratio:
-            text = ''
-
-        if self.is_train_face:
-            return pixel_values, text, 'video', video_dir, expand_face_imgs, dense_masks_tensor, selected_frame_index, reserve_face_imgs, original_face_imgs
-        else:
-            return pixel_values, text, 'video', video_dir
-
-    def __len__(self):
-        return self.num_instance_videos
-
-    def __getitem__(self, idx):
-        sample = {}
-        if self.is_train_face:
-            pixel_values, cap, data_type, video_dir, expand_face_imgs, dense_masks_tensor, selected_frame_index, reserve_face_imgs, original_face_imgs = self.get_batch(idx)
-            sample["instance_prompt"] = self.id_token + cap
-            sample["instance_video"] = pixel_values
-            sample["video_path"] = video_dir
-            if self.is_train_face:
-                sample["expand_face_imgs"] = expand_face_imgs
-                sample["dense_masks_tensor"] = dense_masks_tensor
-                sample["selected_frame_index"] = selected_frame_index
-                if reserve_face_imgs is not None:
-                    sample["reserve_face_imgs"] = reserve_face_imgs
-                if original_face_imgs is not None:
-                    sample["original_face_imgs"] = original_face_imgs
-        else:
-            pixel_values, cap, data_type, video_dir = self.get_batch(idx)
-            sample["instance_prompt"] = self.id_token + cap
-            sample["instance_video"] = pixel_values
-            sample["video_path"] = video_dir
-        return sample
-
-        # while True:
-        #     sample = {}
-        #     try:
-        #         if self.is_train_face:
-        #             pixel_values, cap, data_type, video_dir, expand_face_imgs, dense_masks_tensor, selected_frame_index, reserve_face_imgs, original_face_imgs = self.get_batch(idx)
-        #             sample["instance_prompt"] = self.id_token + cap
-        #             sample["instance_video"] = pixel_values
-        #             sample["video_path"] = video_dir
-        #             if self.is_train_face:
-        #                 sample["expand_face_imgs"] = expand_face_imgs
-        #                 sample["dense_masks_tensor"] = dense_masks_tensor
-        #                 sample["selected_frame_index"] = selected_frame_index
-        #                 if reserve_face_imgs is not None:
-        #                     sample["reserve_face_imgs"] = reserve_face_imgs
-        #                 if original_face_imgs is not None:
-        #                     sample["original_face_imgs"] = original_face_imgs
-        #         else:
-        #             pixel_values, cap, data_type, video_dir, = self.get_batch(idx)
-        #             sample["instance_prompt"] = self.id_token + cap
-        #             sample["instance_video"] = pixel_values
-        #             sample["video_path"] = video_dir
-        #         break
-        #     except Exception as e:
-        #         error_message = str(e)
-        #         video_path = self.instance_video_paths[idx % len(self.instance_video_paths)]
-        #         print(error_message, video_path)
-        #         log_error_to_file(error_message, video_path)
-        #         idx = random.randint(0, self.num_instance_videos - 1)
-        # return sample
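For reference, the commented-out block at the end of the deleted dataloader is a retry-on-failure variant of __getitem__: if a sample fails to load, the error is logged and a random replacement index is drawn. A minimal standalone sketch of that pattern follows; the names RetryingDataset and load are hypothetical stand-ins and not part of this repository.

import random

class RetryingDataset:
    # Toy dataset illustrating the retry-on-error loading pattern from the deleted code.
    def __init__(self, samples):
        self.samples = samples  # stand-in for instance_video_paths

    def load(self, idx):
        # Stand-in for get_batch(idx); a real video loader may raise on corrupt files.
        return self.samples[idx]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        while True:
            try:
                return self.load(idx)
            except Exception as e:  # broad catch, mirroring the original loop
                print(f"Failed to load sample {idx}: {e}")
                idx = random.randint(0, len(self.samples) - 1)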
util/deepspeed_configs/accelerate_config_machine_multi.yaml
DELETED
@@ -1,18 +0,0 @@
-compute_environment: LOCAL_MACHINE
-distributed_type: DEEPSPEED
-deepspeed_config:
-  deepspeed_config_file: util/deepspeed_configs/zero_stage2_config.json
-  deepspeed_hostfile: util/deepspeed_configs/hostfile.txt
-fsdp_config: {}
-machine_rank: 0
-main_process_ip: 100.64.24.6
-main_process_port: 12343
-main_training_function: main
-num_machines: 2
-num_processes: 16
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
util/deepspeed_configs/accelerate_config_machine_single.yaml
DELETED
@@ -1,13 +0,0 @@
-compute_environment: LOCAL_MACHINE
-distributed_type: DEEPSPEED
-deepspeed_config:
-  deepspeed_config_file: util/deepspeed_configs/zero_stage2_config.json
-fsdp_config: {}
-machine_rank: 0
-main_process_ip: null
-main_process_port: 12345
-main_training_function: main
-num_machines: 1
-num_processes: 8
-gpu_ids: 0,1,2,3,4,5,6,7
-use_cpu: false
util/deepspeed_configs/hostfile.txt
DELETED
@@ -1,2 +0,0 @@
-[email protected] slots=8
-[email protected] slots=8
util/deepspeed_configs/zero_stage2_config.json
DELETED
@@ -1,17 +0,0 @@
-{
-    "bf16": {
-        "enabled": true
-    },
-    "train_micro_batch_size_per_gpu": "auto",
-    "train_batch_size": "auto",
-    "gradient_clipping": 1.0,
-    "gradient_accumulation_steps": "auto",
-    "dump_state": true,
-    "zero_optimization": {
-        "stage": 2,
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "sub_group_size": 1e9,
-        "reduce_bucket_size": 5e8
-    }
-}
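For reference, a ZeRO stage-2 JSON like the one removed above can be sanity-checked before launching training; the snippet below is only a sketch, and the hard-coded path assumes the file is still present locally.

import json

# Load the DeepSpeed config and confirm the key settings that would be passed to the launcher.
with open("util/deepspeed_configs/zero_stage2_config.json") as f:
    ds_config = json.load(f)

assert ds_config["zero_optimization"]["stage"] == 2
print("bf16 enabled:", ds_config["bf16"]["enabled"])
print("reduce bucket size:", ds_config["zero_optimization"]["reduce_bucket_size"])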