# Source: Hugging Face Space "Tonioesparza/hb8_ourhood_pilot" (status when captured: Sleeping).
# File size: 14,897 bytes.

import gradio as gr
import numpy as np
import random
# import spaces #[uncomment to use ZeroGPU]
import os
from PIL import Image, ImageDraw, ImageFont
import torch
from PIL import Image
from diffusers.utils import load_image
from diffusers import DPMSolverSDEScheduler
from diffusers import StableDiffusionXLImg2ImgPipeline, DPMSolverMultistepScheduler, AutoencoderTiny, \
    StableDiffusionXLControlNetPipeline, ControlNetModel
from diffusers.utils import load_image
from diffusers.image_processor import IPAdapterMaskProcessor
from torch import nn


### auxiliary functions

def ip_guide(guide, pool, num=3):
    """Select the ``num`` lowest-scoring embeddings of ``pool`` against ``guide``.

    Every entry of ``pool`` is scored with cosine similarity (computed along
    dim 1) against ``guide``; the scores are sorted in ascending order and the
    first ``num`` entries are kept.

    Args:
        guide: Reference embedding tensor. Pool entries are moved to this
            tensor's device before scoring, so the function works on whatever
            device the caller uses instead of requiring CUDA.
        pool: Sequence of embedding tensors to choose from.
        num: How many entries to return (fewer if ``pool`` is shorter).

    Returns:
        A tuple ``(selected, indexes)``: the chosen pool entries (the original
        objects, not the device-moved copies) and their positions in ``pool``.

    NOTE(review): the scores are cosine *similarities*, so the ascending sort
    keeps the LEAST similar entries even though callers name the result
    "closest". The ordering is left untouched here because changing it alters
    generated output -- confirm the intended direction.
    """
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    # Score on the guide's own device rather than a hard-coded 'cuda'.
    scores = [cos(guide, embed.to(guide.device)) for embed in pool]

    # Pair each score with its pool position and order by score, ascending.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1])
    lowest_indexes = [index for index, _ in ranked[:num]]

    return [pool[i] for i in lowest_indexes], lowest_indexes


def make_inpaint_condition(image, image_mask):
    """Build a conditioning tensor in which masked pixels are set to -1.

    Args:
        image: Object exposing ``convert("RGB")`` (e.g. a PIL image); its
            pixels are scaled to the range [0, 1].
        image_mask: Object exposing ``convert("L")``; pixels whose scaled
            value is above 0.5 are treated as masked.

    Returns:
        A float32 ``torch`` tensor of shape ``(1, 3, height, width)`` where
        every channel of a masked pixel equals ``-1.0``.

    Raises:
        ValueError: If the image and the mask differ in height or width.
    """
    pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

    # Compare BOTH spatial dimensions; checking only the height lets a width
    # mismatch through and it then fails later with an opaque indexing error.
    if pixels.shape[:2] != mask.shape[:2]:
        raise ValueError(
            f"image and image_mask must have the same size, got "
            f"{pixels.shape[:2]} and {mask.shape[:2]}"
        )

    pixels[mask > 0.5] = -1.0  # mark masked pixels
    # HWC -> NCHW with a leading batch axis of size 1.
    pixels = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(pixels)

def find_token_sequence_in_pre_tokenized(input_string, other_string, pipe):
    """Locate every occurrence of one string's tokens inside another's.

    Both strings are tokenized with ``pipe.tokenizer.tokenize``; the token
    list of ``other_string`` is then scanned with a sliding window as wide as
    the token list of ``input_string``.

    Args:
        input_string: Text whose token sequence is searched for.
        other_string: Text whose token sequence is searched in.
        pipe: Object exposing a ``tokenizer`` with a ``tokenize`` method.

    Returns:
        A list of ``(tokens, start)`` tuples, one per match, where ``tokens``
        is the matching slice and ``start`` its index in the token list of
        ``other_string``.
    """
    tokenize = pipe.tokenizer.tokenize
    needle = tokenize(input_string)
    haystack = tokenize(other_string)
    width = len(needle)

    # Every window of `width` consecutive tokens, paired with its start index.
    windows = (
        (haystack[start:start + width], start)
        for start in range(len(haystack) - width + 1)
    )
    return [(window, start) for window, start in windows if window == needle]


# ---------------------------------------------------------------------------
# Module-level setup: everything below runs once, at import time.
# ---------------------------------------------------------------------------

# NOTE(review): `device`, `model_repo_id` and `torch_dtype` are not referenced
# again in the visible part of this file -- the loading code below hard-codes
# "cuda" and torch.float16 instead. Confirm whether they can be removed or
# should be wired in.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use

if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# Used by the inference function to preprocess the IP-Adapter mask image.
processor_mask = IPAdapterMaskProcessor()

# Two ControlNet checkpoints, both loaded as fp16: index 0 is the depth
# checkpoint, index 1 the canny checkpoint (going by the repository ids).
controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
    ),
]

### load pipelines

# The base pipeline receives THREE ControlNet slots: controlnets[0] twice and
# controlnets[1] once. The inference function passes three conditioning
# images in the same order.
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained("SG161222/RealVisXL_V5.0", torch_dtype=torch.float16,
                                                              controlnet=[controlnets[0], controlnets[0],
                                                                          controlnets[1]], use_safetensors=True,
                                                              variant='fp16')
###pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
# Replace the pipeline's scheduler with a DPMSolverSDEScheduler built from the
# same repository's scheduler config, with Karras sigmas enabled.
pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained("SG161222/RealVisXL_V5.0", subfolder="scheduler",
                                                          use_karras_sigmas=True)
pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
# Hard-coded device: this line fails on a host without CUDA.
pipe_CN.to("cuda")

############################## load loras
pipe_CN.load_lora_weights('Tonioesparza/ourhood_training_dreambooth_lora_2_0',
                                   weight_name='pytorch_lora_weights.safetensors')
#state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict('Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors')
#pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, prefix='2', adapter_name='text_2_ourhood')
#pipe_CN.set_adapters(["unet_ourhood", "text_ourhood", "text_2_ourhood"], adapter_weights=[1.0, 1.0, 1.0])

# Called right after loading the LoRA weights and before the refiner below
# borrows components from pipe_CN.
pipe_CN.fuse_lora()

# The refiner reuses pipe_CN's second text encoder and VAE objects instead of
# loading its own copies.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0",
                                                           text_encoder_2=pipe_CN.text_encoder_2, vae=pipe_CN.vae,
                                                           torch_dtype=torch.float16, use_safetensors=True,
                                                           variant="fp16")
refiner.to("cuda")

# NOTE(review): torch.load unpickles the file; both .pt files loaded in this
# section must come from a trusted source.
# `ip_pool` is used as a mapping: its values form the embedding pool and its
# keys are printed as case names by the inference function.
ip_pool = torch.load("./embeds_cases_for_ip.pt")

pool = list(ip_pool.values())

# Upper bound of the seed slider.
MAX_SEED = np.iinfo(np.int32).max
# NOTE(review): MAX_IMAGE_SIZE is not referenced in the visible part of this file.
MAX_IMAGE_SIZE = 1024

# Mapping from which the inference function reads the entry 'b' and adds it to
# text embeddings.
slingshot = torch.load("./slingshot.pt")

# Location and file names of the pre-rendered conditioning images, one set per
# scaffold (the "perspective" slider in the UI).
_SCAFFOLD_BASE_URL = "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/"
_SCAFFOLD_FILES = {
    1: {
        'mask1': "mask_in_square_2.png",
        'mask2': "mask_out_square_2.png",
        'depth_image': "mask_depth_noroof_square.png",
        'canny_image': "mask_depth_solo_square.png",
    },
    2: {
        'mask1': "mask_in_C.png",
        'mask2': "mask_out_C.png",
        'depth_image': "depth_C.png",
        'canny_image': "canny_C_solo.png",
    },
    3: {
        'mask1': "mask_in_B.png",
        'mask2': "mask_out_B.png",
        'depth_image': "depth_B.png",
        'canny_image': "canny_B_solo.png",
    },
}

# Phrase whose token embeddings receive the `slingshot['b']` offset.
_BOOTH_PHRASE = 'ourhood privacy booth'
_NEGATIVE_PROMPT = "deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark"


def _encode_with_both_text_encoders(text, max_length):
    """Run ``text`` through both text encoders of ``pipe_CN``.

    The text is tokenized once with ``pipe_CN.tokenizer`` and the same ids are
    fed to ``text_encoder`` and ``text_encoder_2``.

    Returns:
        ``(hidden_1, hidden_2, pooled_2)``: the second-to-last hidden state of
        each encoder and element 0 of the second encoder's output.
    """
    token_ids = pipe_CN.tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to('cuda')

    encoded_1 = pipe_CN.text_encoder(token_ids, output_hidden_states=True)
    encoded_2 = pipe_CN.text_encoder_2(token_ids, output_hidden_states=True)
    return encoded_1.hidden_states[-2], encoded_2.hidden_states[-2], encoded_2[0]


def _booth_token_positions(full_prompt, sequence_length):
    """Return the embedding positions covered by the booth phrase in ``full_prompt``.

    ``find_token_sequence_in_pre_tokenized`` reports indexes into the output of
    ``tokenizer.tokenize()``, which carries no start-of-text token, whereas
    the embeddings come from ``tokenizer(...)``, whose ids begin with that
    token. Each index is therefore shifted by one. Positions at or beyond
    ``sequence_length`` (cut off by truncation) are dropped.
    """
    matches = find_token_sequence_in_pre_tokenized(_BOOTH_PHRASE, full_prompt, pipe_CN)
    positions = [
        start + 1 + offset
        for tokens, start in matches
        for offset in range(len(tokens))
    ]
    return [position for position in positions if position < sequence_length]


def ourhood_inference(prompt: str = "", num_inference_steps: int = 50, scaffold: int = 1, seed: int = 0,
                      cases_strength: float = 0.5, cases_scope: int = 1):
    """Generate one image of the OurHood booth placed in the described setting.

    Args:
        prompt: Free-text description of the setting; it is appended to a
            fixed booth description to form the full prompt.
        num_inference_steps: Step count used for both the base and the
            refiner pass.
        scaffold: Key (1, 2 or 3) selecting the set of pre-rendered mask,
            depth and canny images.
        seed: Seed for the CUDA random generator.
        cases_strength: Scale handed to ``pipe_CN.set_ip_adapter_scale``.
        cases_scope: Number of pool embeddings selected by ``ip_guide`` and
            averaged into the IP-Adapter embedding.

    Returns:
        The first image produced by the refiner pipeline.

    Raises:
        KeyError: If ``scaffold`` is not one of the known scaffold keys.
    """
    # Which embeddings receive the slingshot offset: 'both', 'pooled' or 'embeds'.
    condition = 'both'

    # UI sliders may deliver whole numbers as floats; these values are used as
    # a dict key, a slice bound, a seed and a step count.
    num_inference_steps = int(num_inference_steps)
    scaffold = int(scaffold)
    seed = int(seed)
    cases_scope = int(cases_scope)

    prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + prompt

    # None of the embedding preparation needs gradients; disabling autograd
    # keeps the text-encoder activations from being retained.
    with torch.no_grad():
        ### IP-Adapter embedding: mean of the pool entries picked for this prompt
        guide = pipe_CN.encode_prompt(prompt)
        closest, indexes = ip_guide(guide[2], pool, cases_scope)
        ip_means = torch.mean(torch.stack(closest), dim=0)

        case_names = list(ip_pool.keys())
        print([case_names[i] for i in indexes])

        # First entry is all zeros, second entry is the averaged embedding.
        ip_embeds = torch.cat(
            [torch.unsqueeze(torch.zeros_like(closest[0]), 0), torch.unsqueeze(ip_means, 0)], 0
        ).to(dtype=torch.float16, device='cuda')

        pipe_CN.set_ip_adapter_scale([[cases_strength]])

        ### positive prompt encoding
        prompt_embeds_1, prompt_embeds_2, pooled_prompt_embeds_2 = _encode_with_both_text_encoders(
            prompt1, pipe_CN.tokenizer.model_max_length
        )

        ### slingshot offset
        shift = slingshot['b'].to('cuda')
        pooled_prompt_embeds = pooled_prompt_embeds_2

        if condition in ('both', 'embeds'):
            for position in _booth_token_positions(prompt1, prompt_embeds_2.shape[1]):
                prompt_embeds_2[0][position] = prompt_embeds_2[0][position] + shift

        if condition in ('both', 'pooled'):
            # Keep the encoder's dtype so the pipeline gets what it would
            # have received from the unshifted tensor.
            pooled_prompt_embeds = (pooled_prompt_embeds_2 + shift).to(dtype=pooled_prompt_embeds_2.dtype)

        prompt_embeds = torch.cat([prompt_embeds_1, prompt_embeds_2], dim=-1)

        ### negative prompt encoding, padded to the positive sequence length
        negative_prompt_embeds_1, negative_prompt_embeds_2, negative_pooled_prompt_embeds = (
            _encode_with_both_text_encoders(_NEGATIVE_PROMPT, prompt_embeds.shape[1])
        )
        negative_prompt_embeds = torch.cat([negative_prompt_embeds_1, negative_prompt_embeds_2], dim=-1)

    ### conditioning images for the selected scaffold
    output_height = 1024
    output_width = 1024

    files = _SCAFFOLD_FILES[scaffold]
    mask1 = load_image(_SCAFFOLD_BASE_URL + files['mask1'])
    mask2 = load_image(_SCAFFOLD_BASE_URL + files['mask2'])

    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    depth_image = load_image(_SCAFFOLD_BASE_URL + files['depth_image'])
    canny_image = load_image(_SCAFFOLD_BASE_URL + files['canny_image'])
    masked_depth = make_inpaint_condition(depth_image, mask2)

    ### inference
    generator = torch.Generator(device="cuda").manual_seed(seed)

    # Base pass: stops at 95% of the schedule and returns latents.
    results = pipe_CN(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        # Pass the (possibly shifted) pooled embedding; handing over the
        # untouched encoder output would make the 'pooled' half of the
        # condition a no-op.
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        ip_adapter_image_embeds=[ip_embeds],
        generator=generator,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        denoising_end=0.95,
        # One image per ControlNet slot of pipe_CN, in slot order.
        image=[depth_image, masked_depth, canny_image],
        output_type="latent",
        control_guidance_start=[0.0, 0.35, 0.35],
        control_guidance_end=[0.35, 0.95, 0.95],
        controlnet_conditioning_scale=[0.35, 0.95, 0.95],
        cross_attention_kwargs={"ip_adapter_masks": masks}
    ).images[0]

    # Refiner pass: picks up at 95% of the schedule from the base latents.
    image = refiner(
        prompt=prompt1,
        generator=generator,
        num_inference_steps=num_inference_steps,
        denoising_start=0.95,
        image=results,
    ).images[0]

    return image



#@spaces.GPU #[uncomment to use ZeroGPU]

# Example prompts offered below the input box.
examples = [
    "in a British museum, pavillion, masonry, high-tables and chairs",
    "in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    " in a Nordic atrium environment"]

# Centers the main column and caps its width.
css="""
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # HB8-Ourhood inference test
        """)

        with gr.Row():

            prompt = gr.Text(
                label="Setting prompt",
                show_label=False,
                max_lines=1,
                placeholder="Where do you want to show the Ourhood pod?",
                container=False,
            )

            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):

            # Passed to ourhood_inference as `scaffold`.
            perspective = gr.Slider(
                label="perspective",
                minimum=1,
                maximum=3,
                step=1,
                value=1,
            )

            seed = gr.Slider(
                label="Tracking number (seed)",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            # Passed to ourhood_inference as `cases_strength`.
            cases_strength = gr.Slider(
                label="Brand strength",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.5,
            )

            # Passed to ourhood_inference as `cases_scope`.
            cases_scope = gr.Slider(
                label="Brand scope",
                minimum=1,
                maximum=10,
                step=1,
                value=1,
            )

            with gr.Row():

                num_inference_steps = gr.Slider(
                    label="Detail steps",
                    minimum=35,
                    maximum=75,
                    step=1,
                    value=50, #Replace with defaults that work for your model
                )

        gr.Examples(
            examples = examples,
            inputs = [prompt]
        )

    # Run inference on button click or when the prompt is submitted; the order
    # of `inputs` must match the positional parameters of ourhood_inference.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn = ourhood_inference,
        inputs = [prompt, num_inference_steps, perspective, seed,cases_strength,cases_scope],
        outputs = [result]
    )

demo.queue().launch()