import gradio as gr
import numpy as np
import random
#import spaces #[uncomment to use ZeroGPU]
import os
from PIL import Image, ImageDraw, ImageFont
import torch
from PIL import Image
from diffusers.utils import load_image
from diffusers import DPMSolverSDEScheduler
from diffusers import StableDiffusionXLImg2ImgPipeline, DPMSolverMultistepScheduler, AutoencoderTiny, StableDiffusionXLControlNetPipeline, ControlNetModel
from diffusers.utils import load_image
from diffusers.image_processor import IPAdapterMaskProcessor
from torch import nn

### auxiliary functions

def ip_guide(guide, pool, top_k=5):
    """Pick the embeddings in ``pool`` that are most similar to ``guide``.

    Similarity is the cosine similarity computed along ``dim=1``. Each
    comparison must reduce to a single value (for example a ``(1, dim)`` guide
    against ``(1, dim)`` candidates), because the score is converted to a
    Python float for ranking.

    Args:
        guide: Query embedding tensor.
        pool: Sequence of candidate embedding tensors.
        top_k: Maximum number of matches to return (defaults to 5).

    Returns:
        A ``(embeddings, indexes)`` tuple: the entries of ``pool`` with the
        highest similarity to ``guide`` (best match first), and their
        positions in ``pool``. The returned embeddings are the original pool
        objects; they are not moved to another device.
    """
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    # Compare on the guide's own device rather than assuming CUDA is present.
    similarities = [float(cos(guide, embed.to(guide.device))) for embed in pool]
    # Cosine similarity grows with closeness, so the closest embeddings are the
    # ones with the HIGHEST scores; an ascending sort would return the worst.
    ranked = sorted(
        range(len(similarities)),
        key=similarities.__getitem__,
        reverse=True,
    )
    best_indexes = ranked[:top_k]
    return [pool[i] for i in best_indexes], best_indexes


def make_inpaint_condition(image, image_mask):
    """Build an inpaint-style conditioning tensor from an image and a mask.

    Args:
        image: PIL-style image; converted to RGB and scaled to ``[0, 1]``.
        image_mask: PIL-style mask; converted to 8-bit greyscale and scaled to
            ``[0, 1]``. Pixels above 0.5 are treated as masked.

    Returns:
        A ``float32`` tensor of shape ``(1, 3, height, width)`` in which every
        masked pixel is set to ``-1.0`` in all channels.

    Raises:
        ValueError: If the image and the mask differ in height or width.
    """
    pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

    # Check BOTH spatial dimensions (height and width). Raising instead of
    # asserting keeps the check alive when Python runs with -O.
    if pixels.shape[:2] != mask.shape[:2]:
        raise ValueError(
            "image and image_mask must have the same size, got "
            f"{pixels.shape[:2]} and {mask.shape[:2]}"
        )

    pixels[mask > 0.5] = -1.0  # mark masked pixels
    # HWC -> NCHW with a leading batch dimension of 1.
    pixels = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(pixels)

# Runtime device / dtype selection.
# NOTE(review): `device` and `torch_dtype` are not referenced anywhere else in the
# visible file -- the loading code below hard-codes "cuda" and torch.float16.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/sdxl-turbo" # Placeholder model id; not referenced elsewhere in the visible file


if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# Preprocessor used at inference time to prepare the IP-Adapter region mask.
processor_mask = IPAdapterMaskProcessor()
# ControlNets shared by the pipeline: index 0 is the depth model, index 1 the canny model.
controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0",variant="fp16",use_safetensors=True,torch_dtype=torch.float16
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True,variant="fp16"
    ),
]

### Load pipelines

# The base pipeline gets three ControlNet slots -- depth, depth, canny -- so each call
# has to pass three conditioning images in that same order.
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained("SG161222/RealVisXL_V5.0", torch_dtype=torch.float16,controlnet=[controlnets[0],controlnets[0],controlnets[1]], use_safetensors=True, variant='fp16')
# Swap the pipeline's VAE for the "madebyollin/taesdxl" tiny autoencoder.
pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
# Replace the scheduler with DPMSolverSDE built from the checkpoint's scheduler config, with Karras sigmas enabled.
pipe_CN.scheduler=DPMSolverSDEScheduler.from_pretrained("SG161222/RealVisXL_V5.0",subfolder="scheduler",use_karras_sigmas=True)
pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipe_CN.to("cuda")

### Load LoRAs

# Fetch the LoRA weights once, then attach them under separate adapter names.
state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict('Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors')
pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
# NOTE(review): this second call targets `pipe_CN.text_encoder` again, with prefix='2'.
# The adapter name suggests `pipe_CN.text_encoder_2` was intended -- TODO confirm
# against the diffusers LoRA loader and the key names inside `state_dict`.
pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder ,prefix='2', adapter_name='text_2_ourhood')
# Activate all three adapters at full weight before fusing.
pipe_CN.set_adapters(["unet_ourhood","text_ourhood","text_2_ourhood"], adapter_weights=[1.0, 1.0,1.0])

pipe_CN.fuse_lora()

# The refiner is handed the base pipeline's second text encoder and VAE objects
# (keyword arguments below) rather than its own copies.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0",text_encoder_2=pipe_CN.text_encoder_2,vae=pipe_CN.vae,torch_dtype=torch.float16,use_safetensors=True,variant="fp16")
refiner.to("cuda")

# Precomputed reference embeddings, used as a mapping (its keys and values are read separately).
# NOTE: torch.load deserializes a pickle unless restricted (e.g. weights_only=True),
# so this path must only ever point at a trusted file.
ip_pool = torch.load("./embeds_cases_for_ip.pt")

# Candidate embeddings in the mapping's iteration order, so positions in this
# list line up with list(ip_pool.keys()).
pool = list(ip_pool.values())

# Upper bound of the seed slider in the UI.
MAX_SEED = np.iinfo(np.int32).max
# NOTE(review): MAX_IMAGE_SIZE is not referenced elsewhere in the visible file.
MAX_IMAGE_SIZE = 1024

# Root URL of the repository that hosts the precomputed conditioning assets.
_SCAFFOLD_ASSET_ROOT = "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/"

# Mask / depth / canny file names for every scaffold id accepted by
# `ourhood_inference`.
_SCAFFOLD_ASSETS = {
    1: {
        'mask1': "mask_in_square_2.png",
        'mask2': "mask_out_square_2.png",
        'depth_image': "mask_depth_noroof_square.png",
        'canny_image': "mask_depth_solo_square.png",
    },
    2: {
        'mask1': "mask_in_C.png",
        'mask2': "mask_out_C.png",
        'depth_image': "depth_C.png",
        'canny_image': "canny_C_solo.png",
    },
    3: {
        'mask1': "mask_in_B.png",
        'mask2': "mask_out_B.png",
        'depth_image': "depth_B.png",
        'canny_image': "canny_B_solo.png",
    },
}


def ourhood_inference(prompt: str, num_inference_steps: int = 50, scaffold: int = 1, seed: int = 0):
    """Generate an image of the OurHood booth in the setting given by ``prompt``.

    The prompt is encoded with the base pipeline, and element 2 of the
    ``encode_prompt`` result is passed to ``ip_guide`` to select reference
    embeddings from ``pool``. The first embedding returned is used as the
    IP-Adapter image embedding. The base ControlNet pipeline then denoises up
    to 95% of the schedule, conditioned on the scaffold's depth image, a masked
    depth image and a canny image. The refiner pipeline finishes the last 5%.

    Args:
        prompt: Description of the setting; appended to a fixed booth prompt.
        num_inference_steps: Number of denoising steps for both pipelines.
        scaffold: Id of the precomputed scaffold (a key of ``_SCAFFOLD_ASSETS``).
        seed: Seed for the random generator shared by both pipelines.

    Returns:
        The first image produced by the refiner pipeline.

    Raises:
        ValueError: If ``scaffold`` is not a known scaffold id.
    """
    # UI callers may hand over numeric values that are not exact ints, and
    # `manual_seed` only accepts a Python int.
    num_inference_steps = int(num_inference_steps)
    scaffold = int(scaffold)
    seed = int(seed)

    if scaffold not in _SCAFFOLD_ASSETS:
        raise ValueError(
            f"unknown scaffold {scaffold!r}; expected one of {sorted(_SCAFFOLD_ASSETS)}"
        )
    assets = {
        name: _SCAFFOLD_ASSET_ROOT + filename
        for name, filename in _SCAFFOLD_ASSETS[scaffold].items()
    }

    ### IP-Adapter embedding selection

    # `encode_prompt` is called outside the pipeline's own no-grad context, so
    # disable autograd here to avoid building a graph that is never used.
    with torch.no_grad():
        guide = pipe_CN.encode_prompt(prompt)

    closest, indexes = ip_guide(guide[2], pool)

    pool_keys = list(ip_pool.keys())
    print([pool_keys[i] for i in indexes])

    # Stack an all-zero embedding in front of the selected one: one entry for
    # the negative branch, one for the positive branch.
    selected = closest[0]
    ip_embeds = torch.stack([torch.zeros_like(selected), selected], 0).to(dtype=torch.float16, device='cuda')

    pipe_CN.set_ip_adapter_scale([[0.5]])

    prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + prompt

    ### Mask init

    output_height = 1024
    output_width = 1024

    mask1 = load_image(assets['mask1'])
    mask2 = load_image(assets['mask2'])

    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    ### Precomputed conditioning images

    depth_image = load_image(assets['depth_image'])
    canny_image = load_image(assets['canny_image'])
    masked_depth = make_inpaint_condition(depth_image, mask2)

    ### Inference

    generator = torch.Generator(device="cuda").manual_seed(seed)

    # Fraction of the schedule at which the base pipeline hands over to the refiner.
    handoff = 0.95

    results = pipe_CN(
        prompt=prompt1,
        ip_adapter_image_embeds=[ip_embeds],
        negative_prompt="deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark",
        generator=generator,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        denoising_end=handoff,
        # One image per ControlNet slot of `pipe_CN` (depth, depth, canny).
        image=[depth_image, masked_depth, canny_image],
        output_type="latent",
        control_guidance_start=[0.0, 0.35, 0.35],
        control_guidance_end=[0.35, 0.95, 0.95],
        controlnet_conditioning_scale=[0.35, 0.95, 0.95],
        cross_attention_kwargs={"ip_adapter_masks": masks},
    ).images[0]

    image = refiner(
        prompt=prompt1,
        generator=generator,
        num_inference_steps=num_inference_steps,
        denoising_start=handoff,
        image=results,
    ).images[0]

    return image


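# To run on ZeroGPU: uncomment the `import spaces` line at the top of the file and put
# `@spaces.GPU` directly above `def ourhood_inference` -- a decorator must immediately
# precede the function it wraps, so it cannot be enabled at this position.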

# Sample prompts offered underneath the prompt box.
examples = [
    "in a British museum, pavillion, masonry, high-tables and chairs",
    "in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    " in a Nordic atrium environment",
]

# Centre the app in a fixed-width column.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):
        # Page heading.
        gr.Markdown("\n        # HB8-Ourhood inference test\n        ")

        # Prompt box and run button share one row.
        with gr.Row():
            prompt = gr.Text(
                placeholder="Where do you want to show the Ourhood pod?",
                label="Setting prompt",
                show_label=False,
                container=False,
                max_lines=1,
            )
            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            # Passed to `ourhood_inference` as its third argument (`scaffold`).
            perspective = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="perspective")

            seed = gr.Slider(minimum=0, maximum=MAX_SEED, step=1, value=0, label="Tracking number (seed)")

            with gr.Row():
                # Replace with defaults that work for your model.
                num_inference_steps = gr.Slider(minimum=35, maximum=75, step=1, value=50, label="Detail steps")

        gr.Examples(inputs=[prompt], examples=examples)

    # Clicking "Run" and submitting the prompt box both start an inference.
    gr.on(
        fn=ourhood_inference,
        triggers=[run_button.click, prompt.submit],
        inputs=[prompt, num_inference_steps, perspective, seed],
        outputs=[result],
    )

# Enable request queuing, then start the server.
demo.queue()
demo.launch()