adaface-neurips committed · Commit ad6476f · 1 parent: 57aa583

Better organize code

Files changed:
- app.py (+2 -27)
- pipline_ConsistentID.py (+38 -56)
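
In short, this commit moves ConsistentID checkpoint downloading and BiSeNet face-parsing setup out of app.py and into ConsistentIDPipeline.load_ConsistentID_model, which now takes two local weight paths. A minimal sketch of the resulting app-side setup, reconstructed from the diffs below (pipline_ConsistentID is this Space's local module; the torch_dtype and device choices here are illustrative, not part of the diff):

import torch
from diffusers import EulerDiscreteScheduler
from pipline_ConsistentID import ConsistentIDPipeline  # local module of this Space

device = "cuda" if torch.cuda.is_available() else "cpu"

# Base model; the ConsistentID-specific weights are attached separately below.
pipe = ConsistentIDPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V4.0_noVAE",
    torch_dtype=torch.float16,
    variant="fp16",
).to(device)

# After this commit the pipeline downloads any missing weights itself and
# builds the BiSeNet face-parsing model internally.
pipe.load_ConsistentID_model(
    consistentID_weight_path="./models/ConsistentID-v1.bin",
    bise_net_weight_path="./models/face_parsing.pth",
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)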
app.py CHANGED
@@ -10,11 +10,6 @@ from PIL import Image
 from diffusers.utils import load_image
 from diffusers import EulerDiscreteScheduler
 from pipline_ConsistentID import ConsistentIDPipeline
-from huggingface_hub import hf_hub_download
-### Model can be imported from https://github.com/zllrunning/face-parsing.PyTorch?tab=readme-ov-file
-### We use the ckpt of 79999_iter.pth: https://drive.google.com/open?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812
-### Thanks for the open source of face-parsing model.
-from models.BiSeNet.model import BiSeNet

 # zero = torch.Tensor([0]).cuda()
 # print(zero.device) # <-- 'cpu' 🤔
@@ -26,9 +21,6 @@ script_directory = os.path.dirname(os.path.realpath(__file__))

 # download ConsistentID checkpoint to cache
 base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
-consistentID_path = hf_hub_download(repo_id="JackAILab/ConsistentID",
-                                    filename="ConsistentID-v1.bin", repo_type="model",
-                                    local_dir="./models")

 ### Load base model
 pipe = ConsistentIDPipeline.from_pretrained(
@@ -38,30 +30,13 @@ pipe = ConsistentIDPipeline.from_pretrained(
     variant="fp16"
 ).to(device)

-### Load other pretrained models
-## BiSenet
-bise_net_cp_path = hf_hub_download(repo_id="JackAILab/ConsistentID",
-                                   filename="face_parsing.pth", local_dir="./models")
-bise_net = BiSeNet(n_classes = 19)
-bise_net.load_state_dict(torch.load(bise_net_cp_path, map_location="cpu")) # device fail
-bise_net.cuda()
-
 ### Load consistentID_model checkpoint
 pipe.load_ConsistentID_model(
-
-
-    subfolder="",
-    weight_name=os.path.basename(consistentID_path),
-    trigger_word="img",
+    consistentID_weight_path="./models/ConsistentID-v1.bin",
+    bise_net_weight_path="./models/face_parsing.pth",
 )
 pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

-### Load to cuda
-pipe.to(device)
-pipe.clip_encoder.to(device)
-pipe.image_proj_model.to(device)
-pipe.FacialEncoder.to(device)
-

 @spaces.GPU
 def process(selected_template_images, custom_image, prompt,
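
The pipline_ConsistentID.py changes below make load_ConsistentID_model fetch missing weights from the Hub on demand before building BiSeNet. A standalone sketch of that download-if-missing pattern, using the same repo and paths as the diff (the helper name ensure_local_weight is illustrative, not from the commit):

import os
from huggingface_hub import hf_hub_download

def ensure_local_weight(repo_id: str, weight_path: str) -> str:
    # Download the file into the directory of weight_path only if it is not already present.
    if not os.path.exists(weight_path):
        hf_hub_download(repo_id=repo_id, repo_type="model",
                        filename=os.path.basename(weight_path),
                        local_dir=os.path.dirname(weight_path))
    return weight_path

consistentID_path = ensure_local_weight("JackAILab/ConsistentID", "./models/ConsistentID-v1.bin")
bise_net_path = ensure_local_weight("JackAILab/ConsistentID", "./models/face_parsing.pth")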
pipline_ConsistentID.py CHANGED
@@ -17,6 +17,12 @@ from functions import insert_markers_to_prompt, masks_for_unique_values, apply_m
 from functions import ProjPlusModel, masks_for_unique_values
 from attention import Consistent_IPAttProcessor, Consistent_AttProcessor, FacialEncoder
 from easydict import EasyDict as edict
+from huggingface_hub import hf_hub_download
+### Model can be imported from https://github.com/zllrunning/face-parsing.PyTorch?tab=readme-ov-file
+### We use the ckpt of 79999_iter.pth: https://drive.google.com/open?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812
+### Thanks for the open source of face-parsing model.
+from models.BiSeNet.model import BiSeNet
+import os

 PipelineImageInput = Union[
     PIL.Image.Image,
@@ -51,11 +57,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
     @validate_hf_hub_args
     def load_ConsistentID_model(
         self,
-
-
-        weight_name: str,
-        subfolder: str = '',
-        trigger_word_ID: str = '<|image|>',
+        consistentID_weight_path: str,
+        bise_net_weight_path: str,
         trigger_word_facial: str = '<|facial|>',
         # A CLIP ViT-H/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP.
         # output dim: 1280.
@@ -73,7 +76,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
             self.device, dtype=self.torch_dtype
         )
-        self.clip_preprocessor
+        self.clip_preprocessor = CLIPImageProcessor()
         self.id_image_processor = CLIPImageProcessor()
         self.crop_size = 512

@@ -81,13 +84,22 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.app = FaceAnalysis(name="buffalo_l", providers=['CPUExecutionProvider'])
         self.app.prepare(ctx_id=0, det_size=(640, 640))

-
-
-
-
-
-
-
+        if not os.path.exists(consistentID_weight_path):
+            ### Download pretrained models
+            hf_hub_download(repo_id="JackAILab/ConsistentID", repo_type="model",
+                            filename=os.path.basename(consistentID_weight_path),
+                            local_dir=os.path.dirname(consistentID_weight_path))
+        if not os.path.exists(bise_net_weight_path):
+            hf_hub_download(repo_id="JackAILab/ConsistentID",
+                            filename=os.path.basename(bise_net_weight_path),
+                            local_dir=os.path.dirname(bise_net_weight_path))
+
+        bise_net = BiSeNet(n_classes = 19)
+        bise_net.load_state_dict(torch.load(bise_net_weight_path, map_location="cpu"))
+        bise_net.to(self.device, dtype=self.torch_dtype)
+        bise_net.eval()
+        self.bise_net = bise_net
+
         # Colors for all 20 parts
         self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
                             [255, 0, 85], [255, 0, 170],
@@ -108,47 +120,18 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         ).to(self.device, dtype=self.torch_dtype)
         self.FacialEncoder = FacialEncoder().to(self.device, dtype=self.torch_dtype)

-
-
-
-
-
-
-
-
-
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
-
-        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
-            model_file = _get_model_file(
-                pretrained_model_name_or_path_or_dict,
-                weights_name=weight_name,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=token,
-                revision=revision,
-                subfolder=subfolder,
-                user_agent=user_agent,
-            )
-            if weight_name.endswith(".safetensors"):
-                state_dict = {"id_encoder": {}, "lora_weights": {}}
-                with safe_open(model_file, framework="pt", device="cpu") as f:
-                    ### TODO safetensors add
-                    for key in f.keys():
-                        if key.startswith("FacialEncoder."):
-                            state_dict["FacialEncoder"][key.replace("FacialEncoder.", "")] = f.get_tensor(key)
-                        elif key.startswith("image_proj."):
-                            state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
-            else:
-                state_dict = torch.load(model_file, map_location="cpu")
+        if consistentID_weight_path.endswith(".safetensors"):
+            state_dict = {"id_encoder": {}, "lora_weights": {}}
+            with safe_open(consistentID_weight_path, framework="pt", device="cpu") as f:
+                ### TODO safetensors add
+                for key in f.keys():
+                    if key.startswith("FacialEncoder."):
+                        state_dict["FacialEncoder"][key.replace("FacialEncoder.", "")] = f.get_tensor(key)
+                    elif key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
         else:
-            state_dict =
-
-        self.trigger_word_ID = trigger_word_ID
+            state_dict = torch.load(consistentID_weight_path, map_location="cpu")
+
         self.trigger_word_facial = trigger_word_facial

         self.FacialEncoder.load_state_dict(state_dict["FacialEncoder"], strict=True)
@@ -159,7 +142,6 @@ class ConsistentIDPipeline(StableDiffusionPipeline):

         # Add trigger word token
         if self.tokenizer is not None:
-            self.tokenizer.add_tokens([self.trigger_word_ID], special_tokens=True)
             self.tokenizer.add_tokens([self.trigger_word_facial], special_tokens=True)

     def set_ip_adapter(self):
@@ -264,7 +246,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         image_resize_PIL = image
         img = to_tensor(image)
         img = torch.unsqueeze(img, 0)
-        img = img.
+        img = img.to(self.device, dtype=self.torch_dtype)
         out = self.bise_net(img)[0]
         parsing_anno = out.squeeze(0).cpu().numpy().argmax(0)

@@ -337,7 +319,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         # Remove "<|facial|>" from prompt_face.
         # augmented_prompt: 'A person, police officer, half body shot Detail:
         # The person has one nose , two ears , two eyes , and a mouth , '
-        augmented_prompt = prompt_face.replace("<|facial|>", "")
+        augmented_prompt = prompt_face.replace("<|facial|>", "")
         tokenizer = self.tokenizer
         facial_token_id = tokenizer.convert_tokens_to_ids(facial_token)
         image_token_id = None
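
For reference, the checkpoint-loading branch added above splits a .safetensors file into per-module sub-state-dicts by key prefix and falls back to torch.load for the .bin checkpoint. A standalone sketch of that logic; note the committed code initializes the dict with "id_encoder"/"lora_weights" keys but fills "FacialEncoder"/"image_proj" (the ### TODO comment suggests the safetensors branch is unfinished), so this sketch uses the keys that are actually populated:

import torch
from safetensors import safe_open

def load_consistentid_state_dict(weight_path: str) -> dict:
    # Group .safetensors tensors by module prefix; plain .bin checkpoints are
    # assumed to already contain the nested "FacialEncoder"/"image_proj" dicts.
    if weight_path.endswith(".safetensors"):
        state_dict = {"FacialEncoder": {}, "image_proj": {}}
        with safe_open(weight_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                if key.startswith("FacialEncoder."):
                    state_dict["FacialEncoder"][key.replace("FacialEncoder.", "")] = f.get_tensor(key)
                elif key.startswith("image_proj."):
                    state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
        return state_dict
    return torch.load(weight_path, map_location="cpu")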