import os
import random

import cv2
import numpy as np
import PIL
from PIL import Image

import torch
import spaces
import diffusers
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers.utils import load_image
from huggingface_hub import hf_hub_download
from insightface.app import FaceAnalysis

from pipeline_controlnet_xs_sd_xl_instantid import (
    StableDiffusionXLInstantIDXSPipeline,
    UNetControlNetXSModel,
)
from utils.controlnet_xs import ControlNetXSAdapter
# from controlnet_aux import OpenposeDetector

import gradio as gr

# global variables
MAX_SEED = np.iinfo(np.int32).max
device = "cuda" if torch.cuda.is_available() else "cpu"
weight_dtype = torch.float16 if "cuda" in str(device) else torch.float32

base_model = "frankjoshua/realvisxlV40_v40Bakedvae"
vae_path = "madebyollin/sdxl-vae-fp16-fix"
ckpt = "RED-AIGC/InstantID-XS"
# NOTE: these paths assume the checkpoint repo is cloned locally under ./RED-AIGC/InstantID-XS;
# otherwise the files could be fetched with hf_hub_download(repo_id=ckpt, filename=...).
image_proj_path = os.path.join(ckpt, "image_proj.bin")
cnxs_path = os.path.join(ckpt, "controlnetxs.bin")
cross_attn_path = os.path.join(ckpt, "cross_attn.bin")

# Load face encoder
app = FaceAnalysis(
    name="antelopev2",
    root="./",
    providers=["CPUExecutionProvider"],
)
app.prepare(ctx_id=0, det_size=(640, 640))


def get_ControlNetXS(base_model, cnxs_path, device, size_ratio=0.125, weight_dtype=torch.float16):
    unet = UNet2DConditionModel.from_pretrained(base_model, subfolder="unet").to(device, dtype=weight_dtype)
    controlnet = ControlNetXSAdapter.from_unet(unet, size_ratio=size_ratio, learn_time_embedding=True)

    state_dict = torch.load(cnxs_path, map_location="cpu", weights_only=True)
    # Remap checkpoint keys to the adapter's naming scheme; cross-attention
    # processors are skipped here and loaded separately via load_ip_adapter.
    ctrl_state_dict = {}
    for key, value in state_dict.items():
        if "attn2.processor" in key:
            continue
        if "ctrl_" in key and "ctrl_to_base" not in key:
            key = key.replace("ctrl_", "")
        if "up_blocks" in key:
            key = key.replace("up_blocks", "up_connections")
        ctrl_state_dict[key] = value
    controlnet.load_state_dict(ctrl_state_dict, strict=True)
    controlnet.to(device, dtype=weight_dtype)

    return UNetControlNetXSModel.from_unet(unet, controlnet).to(device, dtype=weight_dtype)


ControlNetXS = get_ControlNetXS(base_model, cnxs_path, device, size_ratio=0.125, weight_dtype=weight_dtype)

vae = AutoencoderKL.from_pretrained(vae_path, torch_dtype=weight_dtype)
pipe = StableDiffusionXLInstantIDXSPipeline.from_pretrained(
    base_model,
    vae=vae,
    unet=ControlNetXS,
    controlnet=None,
    torch_dtype=weight_dtype,
)
pipe.cuda(device=device, dtype=weight_dtype, use_xformers=True)
pipe.load_ip_adapter(image_proj_path, cross_attn_path)
pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.unet.config.ctrl_learn_time_embedding = True
pipe = pipe.to(device)
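# `draw_kps` is called below but was neither imported nor defined in this file.
# A minimal sketch follows, matching the helper shipped with the original InstantID
# reference code (assumption: InstantID-XS uses the same 5-point keypoint rendering).
import math


def draw_kps(
    image_pil,
    kps,
    color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)],
):
    stickwidth = 4
    limb_seq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
    kps = np.array(kps)

    w, h = image_pil.size
    out_img = np.zeros([h, w, 3])

    # Draw limbs as filled ellipses between keypoint pairs.
    for i in range(len(limb_seq)):
        index = limb_seq[i]
        color = color_list[index[0]]
        x = kps[index][:, 0]
        y = kps[index][:, 1]
        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
        polygon = cv2.ellipse2Poly(
            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
        )
        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
    out_img = (out_img * 0.6).astype(np.uint8)

    # Draw the keypoints themselves as filled circles.
    for idx_kp, kp in enumerate(kps):
        color = color_list[idx_kp]
        x, y = kp
        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)

    return Image.fromarray(out_img.astype(np.uint8))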
def toggle_lcm_ui(value):
    if value:
        return (
            gr.update(minimum=0, maximum=100, step=1, value=5),
            gr.update(minimum=0.1, maximum=20.0, step=0.1, value=1.5),
        )
    else:
        return (
            gr.update(minimum=5, maximum=100, step=1, value=30),
            gr.update(minimum=0.1, maximum=20.0, step=0.1, value=5),
        )


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


def remove_tips():
    return gr.update(visible=False)


def get_example():
    case = [
        [
            "./examples/1.jpg",
            None,
            "a woman,(looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture",
            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
        ],
        [
            "./examples/1.jpeg",
            "./examples/poses/pose1.jpg",
            "a woman,(looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture",
            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
        ],
    ]
    return case


def run_for_examples(face_file, pose_file, prompt, negative_prompt):
    return generate_image(
        face_file,
        pose_file,
        prompt,
        negative_prompt,
        20,   # num_steps
        0.8,  # controlnet_conditioning_scale (IdentityNet strength)
        0.8,  # adapter_strength_ratio
        5.0,  # guidance_scale
        42,   # seed
    )


def convert_from_cv2_to_image(img: np.ndarray) -> Image.Image:
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))


def convert_from_image_to_cv2(img: Image.Image) -> np.ndarray:
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)


def resize_img(
    input_image,
    max_side=1280,
    min_side=1024,
    size=None,
    pad_to_max_side=False,
    mode=PIL.Image.BILINEAR,
    base_pixel_number=64,
):
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        # Scale up so the short side reaches min_side, then down so the long side
        # fits within max_side, and snap both sides to multiples of base_pixel_number.
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        # Center the resized image on a white max_side x max_side canvas.
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image
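# The UI below references `title`, `description`, `tips`, `article`, and
# `enable_lcm_arg`, none of which are defined in this file. Minimal placeholders
# follow (an assumption: the original Space presumably ships richer copy).
title = "# InstantID-XS"
description = "Generate identity-preserving portraits from a single face photo, optionally guided by a reference pose image."
tips = "Usage tips: use a clear, front-facing photo, and tune the IdentityNet and adapter strengths to trade fidelity against detail."
article = ""
enable_lcm_arg = False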
@spaces.GPU
def generate_image(
    face_image_path,
    pose_image_path,
    prompt,
    negative_prompt,
    num_steps,
    controlnet_conditioning_scale,
    adapter_strength_ratio,
    guidance_scale,
    seed,
    progress=gr.Progress(track_tqdm=True),
):
    if face_image_path is None:
        raise gr.Error("Cannot find any input face image! Please upload a face image.")

    if prompt is None:
        prompt = "a person"

    face_image = load_image(face_image_path)
    face_image = resize_img(face_image, max_side=1024)
    face_image_cv2 = convert_from_image_to_cv2(face_image)
    height, width, _ = face_image_cv2.shape

    # Extract face features
    face_info = app.get(face_image_cv2)

    if len(face_info) == 0:
        raise gr.Error("Unable to detect a face in the image. Please upload a different photo with a clear face.")

    # Use only the largest detected face.
    face_info = sorted(
        face_info,
        key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
    )[-1]
    face_emb = torch.from_numpy(face_info.normed_embedding)
    face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])

    if pose_image_path is not None:
        pose_image = load_image(pose_image_path)
        pose_image = resize_img(pose_image, max_side=1024)
        pose_image_cv2 = convert_from_image_to_cv2(pose_image)

        face_info = app.get(pose_image_cv2)
        if len(face_info) == 0:
            raise gr.Error("Cannot find any face in the reference image! Please upload another person image.")
        face_info = face_info[-1]
        face_kps = draw_kps(pose_image, face_info["kps"])
        width, height = face_kps.size

    print("Start inference...")
    print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")

    pipe.set_ip_adapter_scale(adapter_strength_ratio)
    images = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image_embeds=face_emb,
        image=face_kps,
        control_mask=None,  # no face-region mask in this demo
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        generator=torch.Generator(device=device).manual_seed(seed),
    ).images

    return images[0], gr.update(visible=True)


css = """
.gradio-container {width: 85% !important}
"""
with gr.Blocks(css=css) as demo:
    # description
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            with gr.Row(equal_height=True):
                # upload face image
                face_file = gr.Image(label="Upload a photo of your face", type="filepath")
                # optional: upload a reference pose image
                pose_file = gr.Image(
                    label="Upload a reference pose image (Optional)",
                    type="filepath",
                )

            # prompt
            prompt = gr.Textbox(
                label="Prompt",
                info="A simple prompt is enough to achieve good face fidelity",
                placeholder="A photo of a person",
                value="",
            )

            submit = gr.Button("Submit", variant="primary")
            enable_LCM = gr.Checkbox(
                label="Enable Fast Inference with LCM",
                value=enable_lcm_arg,
                info="LCM speeds up inference; the trade-off is the quality of the generated image. It performs better with portrait face images than with distant faces",
            )
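            # NOTE: toggling this checkbox only adjusts the step/guidance sliders via
            # toggle_lcm_ui; this file never loads LCM weights. Actual LCM-speed
            # sampling would also need something like the following (a sketch, not
            # part of this demo):
            #   pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
            #   pipe.scheduler = diffusers.LCMScheduler.from_config(pipe.scheduler.config)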
            # strength
            controlnet_conditioning_scale = gr.Slider(
                label="IdentityNet strength (for fidelity)",
                minimum=0,
                maximum=1.0,
                step=0.1,
                value=0.8,
            )
            adapter_strength_ratio = gr.Slider(
                label="Image adapter strength (for detail)",
                minimum=0,
                maximum=1.2,
                step=0.1,
                value=0.8,
            )

            with gr.Accordion(open=False, label="Advanced Options"):
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    placeholder="low quality",
                    value="(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
                )
                num_steps = gr.Slider(
                    label="Number of sample steps",
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=20,
                )
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.1,
                    maximum=10.0,
                    step=0.1,
                    value=5.0,
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=42,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

        with gr.Column(scale=1):
            gallery = gr.Image(label="Generated Images")
            usage_tips = gr.Markdown(label="InstantID Usage Tips", value=tips, visible=False)

    submit.click(
        fn=remove_tips,
        outputs=usage_tips,
    ).then(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=generate_image,
        inputs=[
            face_file,
            pose_file,
            prompt,
            negative_prompt,
            num_steps,
            controlnet_conditioning_scale,
            adapter_strength_ratio,
            guidance_scale,
            seed,
        ],
        outputs=[gallery, usage_tips],
    )

    enable_LCM.input(
        fn=toggle_lcm_ui,
        inputs=[enable_LCM],
        outputs=[num_steps, guidance_scale],
        queue=False,
    )

    gr.Examples(
        examples=get_example(),
        inputs=[face_file, pose_file, prompt, negative_prompt],
        fn=run_for_examples,
        outputs=[gallery, usage_tips],
        cache_examples=True,
    )

    gr.Markdown(article)

demo.queue(api_open=False)
demo.launch()
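# Running locally (a sketch; assumes the weights and ./examples assets are available,
# and that the `spaces` package is installed; its @spaces.GPU decorator is a no-op
# outside Hugging Face Spaces):
#   pip install diffusers transformers insightface onnxruntime gradio spaces
#   python app.py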