XuDongZhou committed
Commit 4375fb1 · verified · 1 Parent(s): 1633b7b

Update app.py

Files changed (1)
  1. app.py +354 -106
app.py CHANGED
@@ -1,154 +1,402 @@
- import gradio as gr
- import numpy as np
  import random

- # import spaces #[uncomment to use ZeroGPU]
- from diffusers import DiffusionPipeline
- import torch

- device = "cuda" if torch.cuda.is_available() else "cpu"
- model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use

- if torch.cuda.is_available():
-     torch_dtype = torch.float16
- else:
-     torch_dtype = torch.float32

- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
- pipe = pipe.to(device)

  MAX_SEED = np.iinfo(np.int32).max
- MAX_IMAGE_SIZE = 1024


- # @spaces.GPU #[uncomment to use ZeroGPU]
- def infer(
      prompt,
      negative_prompt,
-     seed,
-     randomize_seed,
-     width,
-     height,
      guidance_scale,
-     num_inference_steps,
      progress=gr.Progress(track_tqdm=True),
  ):
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)

-     generator = torch.Generator().manual_seed(seed)

-     image = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          guidance_scale=guidance_scale,
-         num_inference_steps=num_inference_steps,
-         width=width,
          height=height,
-         generator=generator,
-     ).images[0]

-     return image, seed


- examples = [
-     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-     "An astronaut riding a green horse",
-     "A delicious ceviche cheesecake slice",
- ]

  css = """
- #col-container {
-     margin: 0 auto;
-     max-width: 640px;
- }
  """
-
  with gr.Blocks(css=css) as demo:
-     with gr.Column(elem_id="col-container"):
-         gr.Markdown(" # Text-to-Image Gradio Template")

-         with gr.Row():
-             prompt = gr.Text(
                  label="Prompt",
-                 show_label=False,
-                 max_lines=1,
-                 placeholder="Enter your prompt",
-                 container=False,
              )

-             run_button = gr.Button("Run", scale=0, variant="primary")
-
-         result = gr.Image(label="Result", show_label=False)
-
-         with gr.Accordion("Advanced Settings", open=False):
-             negative_prompt = gr.Text(
-                 label="Negative prompt",
-                 max_lines=1,
-                 placeholder="Enter a negative prompt",
-                 visible=False,
              )

-             seed = gr.Slider(
-                 label="Seed",
                  minimum=0,
-                 maximum=MAX_SEED,
-                 step=1,
-                 value=0,
              )

-             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-             with gr.Row():
-                 width = gr.Slider(
-                     label="Width",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024,  # Replace with defaults that work for your model
                  )
-
-                 height = gr.Slider(
-                     label="Height",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024,  # Replace with defaults that work for your model
                  )
-
-             with gr.Row():
                  guidance_scale = gr.Slider(
                      label="Guidance scale",
-                     minimum=0.0,
                      maximum=10.0,
                      step=0.1,
-                     value=0.0,  # Replace with defaults that work for your model
                  )
-
-                 num_inference_steps = gr.Slider(
-                     label="Number of inference steps",
-                     minimum=1,
-                     maximum=50,
                      step=1,
-                     value=2,  # Replace with defaults that work for your model
                  )

-         gr.Examples(examples=examples, inputs=[prompt])
-     gr.on(
-         triggers=[run_button.click, prompt.submit],
-         fn=infer,
-         inputs=[
-             prompt,
-             negative_prompt,
-             seed,
-             randomize_seed,
-             width,
-             height,
-             guidance_scale,
-             num_inference_steps,
-         ],
-         outputs=[result, seed],
      )

- if __name__ == "__main__":
-     demo.launch()
+ import cv2, os
+ import torch
  import random
+ import numpy as np

+ import spaces

+ import PIL
+ from PIL import Image
+ from typing import Tuple

+ import diffusers
+ from diffusers.utils import load_image

+ from diffusers import (
+     AutoencoderKL,
+     UNet2DConditionModel,
+     UniPCMultistepScheduler,
+ )

+ from huggingface_hub import hf_hub_download
+
+ from insightface.app import FaceAnalysis
+
+ # draw_kps is assumed to be exported by the bundled pipeline module, as in the upstream InstantID demo
+ from pipeline_controlnet_xs_sd_xl_instantid import StableDiffusionXLInstantIDXSPipeline, UNetControlNetXSModel, draw_kps
+
+ from utils.controlnet_xs import ControlNetXSAdapter
+ # from controlnet_aux import OpenposeDetector
+
+ import gradio as gr
+
+ import torch.nn.functional as F
+ from torchvision.transforms import Compose
+
+ # global variables
  MAX_SEED = np.iinfo(np.int32).max
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ weight_dtype = torch.float16 if "cuda" in device else torch.float32
+
+
+ base_model = 'frankjoshua/realvisxlV40_v40Bakedvae'
+ vae_path = 'madebyollin/sdxl-vae-fp16-fix'
+ ckpt = 'RED-AIGC/InstantID-XS'
+
+ image_proj_path = os.path.join(ckpt, "image_proj.bin")
+ cnxs_path = os.path.join(ckpt, "controlnetxs.bin")
+ cross_attn_path = os.path.join(ckpt, "cross_attn.bin")
+
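The three checkpoint paths above only resolve if `RED-AIGC/InstantID-XS` is cloned next to the app. A minimal sketch of the alternative, pulling the same files through the already-imported `hf_hub_download` (file names taken from the commit; whether a download is needed depends on the Space layout):

    # Fetch the InstantID-XS weights from the Hub instead of relying on a local clone.
    image_proj_path = hf_hub_download(repo_id=ckpt, filename="image_proj.bin")
    cnxs_path = hf_hub_download(repo_id=ckpt, filename="controlnetxs.bin")
    cross_attn_path = hf_hub_download(repo_id=ckpt, filename="cross_attn.bin")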
+ # Load face encoder
+ app = FaceAnalysis(
+     name="antelopev2",
+     root="./",
+     providers=["CPUExecutionProvider"],
+ )
+ app.prepare(ctx_id=0, det_size=(640, 640))
+
+
+ def get_ControlNetXS(base_model, cnxs_path, device, size_ratio=0.125, weight_dtype=torch.float16):
+     # Build a ControlNet-XS adapter on top of the base UNet, then load the released weights into it.
+     unet = UNet2DConditionModel.from_pretrained(base_model, subfolder="unet").to(device, dtype=weight_dtype)
+     controlnet = ControlNetXSAdapter.from_unet(unet, size_ratio=size_ratio, learn_time_embedding=True)
+     state_dict = torch.load(cnxs_path, map_location="cpu", weights_only=True)
+     # Remap checkpoint keys to the adapter's naming scheme.
+     ctrl_state_dict = {}
+     for key, value in state_dict.items():
+         if 'attn2.processor' not in key:
+             if 'ctrl_' in key and 'ctrl_to_base' not in key:
+                 key = key.replace('ctrl_', '')
+             if 'up_blocks' in key:
+                 key = key.replace('up_blocks', 'up_connections')
+             ctrl_state_dict[key] = value
+     controlnet.load_state_dict(ctrl_state_dict, strict=True)
+     controlnet.to(device, dtype=weight_dtype)
+     ControlNetXS = UNetControlNetXSModel.from_unet(unet, controlnet).to(device, dtype=weight_dtype)
+
+     return ControlNetXS
+
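The key remapping in `get_ControlNetXS` strips the `ctrl_` prefix (except for the `ctrl_to_base` connectors), renames `up_blocks` to `up_connections`, and skips the attn2 processor entries. A small self-check with illustrative key names (not taken from the checkpoint):

    def _remap(key: str) -> str:
        # Mirrors the loop above for a single key.
        if 'ctrl_' in key and 'ctrl_to_base' not in key:
            key = key.replace('ctrl_', '')
        if 'up_blocks' in key:
            key = key.replace('up_blocks', 'up_connections')
        return key

    assert _remap("ctrl_up_blocks.1.resnets.0.conv1.weight") == "up_connections.1.resnets.0.conv1.weight"
    assert _remap("ctrl_to_base.0.weight") == "ctrl_to_base.0.weight"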
+ ControlNetXS = get_ControlNetXS(base_model, cnxs_path, device, size_ratio=0.125, weight_dtype=weight_dtype)
+ vae = AutoencoderKL.from_pretrained(vae_path)
+ pipe = StableDiffusionXLInstantIDXSPipeline.from_pretrained(
+     base_model,
+     vae=vae,
+     unet=ControlNetXS,
+     controlnet=None,
+     torch_dtype=weight_dtype,
+ )
+
+ pipe.cuda(device=device, dtype=weight_dtype, use_xformers=True)
+ pipe.load_ip_adapter(image_proj_path, cross_attn_path)
+
+ pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+ pipe.unet.config.ctrl_learn_time_embedding = True
+ pipe = pipe.to(device)
+
+
+ def toggle_lcm_ui(value):
+     if value:
+         return (
+             gr.update(minimum=0, maximum=100, step=1, value=5),
+             gr.update(minimum=0.1, maximum=20.0, step=0.1, value=1.5),
+         )
+     else:
+         return (
+             gr.update(minimum=5, maximum=100, step=1, value=30),
+             gr.update(minimum=0.1, maximum=20.0, step=0.1, value=5),
+         )
+
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+
+
+ def remove_tips():
+     return gr.update(visible=False)
+
+
+ def get_example():
+     case = [
+         [
+             "./examples/1.jpg",
+             None,
+             "a woman,(looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture",
+             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
+         ],
+         [
+             "./examples/1.jpeg",
+             "./examples/poses/pose1.jpg",
+             "a woman,(looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture",
+             "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
+         ],
+     ]
+     return case
+
+
+ def run_for_examples(face_file, pose_file, prompt, negative_prompt):
+     return generate_image(
+         face_file,
+         pose_file,
+         prompt,
+         negative_prompt,
+         20,  # num_steps
+         0.8,  # identitynet_strength_ratio (controlnet_conditioning_scale)
+         0.8,  # adapter_strength_ratio
+         5.0,  # guidance_scale
+         42,  # seed
+     )
+
+
+ def convert_from_cv2_to_image(img: np.ndarray) -> Image:
+     return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+
+
+ def convert_from_image_to_cv2(img: Image) -> np.ndarray:
+     return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+
+
+ def resize_img(
+     input_image,
+     max_side=1280,
+     min_side=1024,
+     size=None,
+     pad_to_max_side=False,
+     mode=PIL.Image.BILINEAR,
+     base_pixel_number=64,
+ ):
+     w, h = input_image.size
+     if size is not None:
+         w_resize_new, h_resize_new = size
+     else:
+         ratio = min_side / min(h, w)
+         w, h = round(ratio * w), round(ratio * h)
+         ratio = max_side / max(h, w)
+         input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
+         w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
+         h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
+     input_image = input_image.resize([w_resize_new, h_resize_new], mode)

+     if pad_to_max_side:
+         res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
+         offset_x = (max_side - w_resize_new) // 2
+         offset_y = (max_side - h_resize_new) // 2
+         res[
+             offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
+         ] = np.array(input_image)
+         input_image = Image.fromarray(res)
+     return input_image

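`resize_img` normalizes arbitrary uploads to dimensions the SDXL pipeline can handle; a worked example with an illustrative 3000x2000 input and the defaults above:

    # 3000x2000 is first scaled so the short side reaches min_side=1024 (-> 1536x1024),
    # then capped so the long side is at most max_side=1280 (-> 1280x853),
    # and finally snapped down to multiples of base_pixel_number=64 (-> 1280x832).
    example = Image.new("RGB", (3000, 2000))
    print(resize_img(example).size)  # (1280, 832)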
+ @spaces.GPU
+ def generate_image(
+     face_image_path,
+     pose_image_path,
      prompt,
      negative_prompt,
+     num_steps,
+     controlnet_conditioning_scale,
+     adapter_strength_ratio,
      guidance_scale,
+     seed,
      progress=gr.Progress(track_tqdm=True),
  ):
+     if face_image_path is None:
+         raise gr.Error(
+             "Cannot find any input face image! Please upload the face image"
+         )
+
+     if prompt is None:
+         prompt = "a person"
+
+     # apply the style template (disabled: apply_style / style_name are not defined in this app)
+     # prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
+
+     face_image = load_image(face_image_path)
+     face_image = resize_img(face_image, max_side=1024)
+     face_image_cv2 = convert_from_image_to_cv2(face_image)
+     height, width, _ = face_image_cv2.shape
+
+     # Extract face features
+     face_info = app.get(face_image_cv2)
+
+     if len(face_info) == 0:
+         raise gr.Error(
+             "Unable to detect a face in the image. Please upload a different photo with a clear face."
+         )
+
+     face_info = sorted(
+         face_info,
+         key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
+     )[-1]  # only use the largest face
+
+     face_emb = torch.from_numpy(face_info.normed_embedding)
+     face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])
+     img_controlnet = face_image
+     if pose_image_path is not None:
+         pose_image = load_image(pose_image_path)
+         pose_image = resize_img(pose_image, max_side=1024)
+         img_controlnet = pose_image
+         pose_image_cv2 = convert_from_image_to_cv2(pose_image)
+
+         face_info = app.get(pose_image_cv2)
+
+         if len(face_info) == 0:
+             raise gr.Error(
+                 "Cannot find any face in the reference image! Please upload another person image"
+             )
+
+         face_info = face_info[-1]
+         face_kps = draw_kps(pose_image, face_info["kps"])
+
+         width, height = face_kps.size
+
+     print("Start inference...")
+     print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
+
+     pipe.set_ip_adapter_scale(adapter_strength_ratio)
+     images = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
+         image_embeds=face_emb,
+         image=face_kps,
+         control_mask=control_mask,
+         controlnet_conditioning_scale=controlnet_conditioning_scale,
+         num_inference_steps=num_steps,
          guidance_scale=guidance_scale,
          height=height,
+         width=width,
+         generator=torch.Generator(device=device).manual_seed(seed),
+     ).images
+
+     return images[0], gr.update(visible=True)


  css = """
+ .gradio-container {width: 85% !important}
  """
 
  with gr.Blocks(css=css) as demo:
+     # description
+     gr.Markdown(title)
+     gr.Markdown(description)

+     with gr.Row():
+         with gr.Column():
+             with gr.Row(equal_height=True):
+                 # upload face image
+                 face_file = gr.Image(
+                     label="Upload a photo of your face", type="filepath"
+                 )
+                 # optional: upload a reference pose image
+                 pose_file = gr.Image(
+                     label="Upload a reference pose image (Optional)",
+                     type="filepath",
+                 )
+
+             # prompt
+             prompt = gr.Textbox(
                  label="Prompt",
+                 info="A simple prompt is enough to achieve good face fidelity",
+                 placeholder="A photo of a person",
+                 value="",
              )

+             submit = gr.Button("Submit", variant="primary")
+             enable_LCM = gr.Checkbox(
+                 label="Enable Fast Inference with LCM", value=enable_lcm_arg,
+                 info="LCM speeds up inference; the trade-off is image quality. It works better on close-up portraits than on distant faces",
              )

+             # strength
+             controlnet_conditioning_scale = gr.Slider(
+                 label="IdentityNet strength (for fidelity)",
                  minimum=0,
+                 maximum=1.0,
+                 step=0.1,
+                 value=0.8,
+             )
+             adapter_strength_ratio = gr.Slider(
+                 label="Image adapter strength (for detail)",
+                 minimum=0,
+                 maximum=1.2,
+                 step=0.1,
+                 value=0.8,
              )

+             with gr.Accordion(open=False, label="Advanced Options"):
+                 negative_prompt = gr.Textbox(
+                     label="Negative Prompt",
+                     placeholder="low quality",
+                     value="(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
                  )
+                 num_steps = gr.Slider(
+                     label="Number of sample steps",
+                     minimum=1,
+                     maximum=100,
+                     step=1,
+                     value=20,
                  )
                  guidance_scale = gr.Slider(
                      label="Guidance scale",
+                     minimum=0.1,
                      maximum=10.0,
                      step=0.1,
+                     value=5.0,
                  )
+                 seed = gr.Slider(
+                     label="Seed",
+                     minimum=0,
+                     maximum=MAX_SEED,
                      step=1,
+                     value=42,
                  )

+                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+         with gr.Column(scale=1):
+             gallery = gr.Image(label="Generated Images")
+             usage_tips = gr.Markdown(
+                 label="InstantID Usage Tips", value=tips, visible=False
+             )
+
+     submit.click(
+         fn=remove_tips,
+         outputs=usage_tips,
+     ).then(
+         fn=randomize_seed_fn,
+         inputs=[seed, randomize_seed],
+         outputs=seed,
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=generate_image,
+         inputs=[
+             face_file,
+             pose_file,
+             prompt,
+             negative_prompt,
+             num_steps,
+             controlnet_conditioning_scale,
+             adapter_strength_ratio,
+             guidance_scale,
+             seed,
+         ],
+         outputs=[gallery, usage_tips],
+     )
+
+     enable_LCM.input(
+         fn=toggle_lcm_ui,
+         inputs=[enable_LCM],
+         outputs=[num_steps, guidance_scale],
+         queue=False,
+     )
+
+     gr.Examples(
+         examples=get_example(),
+         inputs=[face_file, pose_file, prompt, negative_prompt],
+         fn=run_for_examples,
+         outputs=[gallery, usage_tips],
+         cache_examples=True,
      )

+     gr.Markdown(article)
+
+ demo.queue(api_open=False)
+ demo.launch()