"""Gradio demo: SDXL + ControlNet + IP-Adapter inference for the OurHood booth.

Importing this module loads all models onto the GPU and launches the Gradio
app (see the last line of the file).
"""

import os
import random

import gradio as gr
import numpy as np
import torch
from diffusers import (
    AutoencoderTiny,
    ControlNetModel,
    DPMSolverMultistepScheduler,
    DPMSolverSDEScheduler,
    StableDiffusionXLControlNetPipeline,
    StableDiffusionXLImg2ImgPipeline,
)
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image
from PIL import Image, ImageDraw, ImageFont
from torch import nn

# import spaces  # [uncomment to use ZeroGPU]


### auxiliary functions

def ip_guide(guide, pool, num=3):
    """Pick the ``num`` embeddings in ``pool`` most similar to ``guide``.

    Similarity is the cosine similarity along ``dim=1``, so a *higher* score
    means a closer embedding. The original code sorted these scores in
    ascending order and therefore returned the least similar entries, although
    the caller treats the result as the "closest" ones; the ranking is now
    descending.

    Args:
        guide: Query embedding tensor (assumed 2-D with a single row, so each
            comparison yields one score — TODO confirm against the pool file).
        pool: Sequence of candidate embedding tensors.
        num: How many candidates to return.

    Returns:
        ``(selected, indexes)``: the chosen tensors from ``pool`` and their
        positions in ``pool``, best match first.
    """
    num = int(num)  # Gradio sliders may deliver the value as a float.
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    similarities = [float(cos(guide, embed.to(guide.device))) for embed in pool]
    ranked = sorted(
        range(len(similarities)),
        key=similarities.__getitem__,
        reverse=True,
    )
    best_indexes = ranked[:num]
    return [pool[i] for i in best_indexes], best_indexes


def make_inpaint_condition(image, image_mask):
    """Build a ControlNet conditioning tensor with masked pixels set to -1.

    Args:
        image: PIL image; converted to RGB and scaled to ``[0, 1]``.
        image_mask: PIL image; converted to greyscale. Pixels whose scaled
            value is above 0.5 are treated as masked.

    Returns:
        A float32 ``torch.Tensor`` laid out as ``(1, 3, H, W)``.

    Raises:
        ValueError: If the image and the mask differ in height or width.
    """
    image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

    # The original asserted on shape[0:1], which only compares the height.
    if image.shape[:2] != image_mask.shape[:2]:
        raise ValueError(
            f"image and image_mask must have the same size, got "
            f"{image.shape[:2]} and {image_mask.shape[:2]}"
        )

    image[image_mask > 0.5] = -1.0  # set as masked pixel
    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(image)


def find_token_sequence_in_pre_tokenized(input_string, other_string, pipe):
    """Locate every occurrence of one string's tokens inside another's.

    Both strings are tokenized with ``pipe.tokenizer.tokenize``.

    Args:
        input_string: Phrase to look for.
        other_string: Text to search in.
        pipe: Pipeline object exposing a ``tokenizer`` attribute.

    Returns:
        A list of ``(matched_tokens, start_index)`` tuples, where
        ``start_index`` is the position in the token list of
        ``other_string``. An empty list is returned when ``input_string``
        produces no tokens.
    """
    tokenizer = pipe.tokenizer
    needle = tokenizer.tokenize(input_string)
    haystack = tokenizer.tokenize(other_string)

    if not needle:
        # An empty phrase would otherwise "match" at every position.
        return []

    width = len(needle)
    return [
        (haystack[start:start + width], start)
        for start in range(len(haystack) - width + 1)
        if haystack[start:start + width] == needle
    ]


device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use

if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# NOTE(review): `device` and `torch_dtype` are computed above but everything
# below is hard-wired to "cuda" / float16, so the script requires a GPU.

processor_mask = IPAdapterMaskProcessor()

controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0",
        variant="fp16",
        use_safetensors=True,
        torch_dtype=torch.float16,
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0",
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
    ),
]

### load pipelines

# The depth ControlNet is used twice (plain depth and masked depth), followed
# by the canny ControlNet; see the `image=[...]` list in `ourhood_inference`.
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained(
    "SG161222/RealVisXL_V5.0",
    torch_dtype=torch.float16,
    controlnet=[controlnets[0], controlnets[0], controlnets[1]],
    use_safetensors=True,
    variant='fp16',
)

### pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)

pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained(
    "SG161222/RealVisXL_V5.0",
    subfolder="scheduler",
    use_karras_sigmas=True,
)
pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipe_CN.to("cuda")

############################## load loras

pipe_CN.load_lora_weights(
    'Tonioesparza/ourhood_training_dreambooth_lora_2_0',
    weight_name='pytorch_lora_weights.safetensors',
)
# Alternative kept for reference: load the LoRA per component with named adapters.
# state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict('Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors')
# pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
# pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
# pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, prefix='2', adapter_name='text_2_ourhood')
# pipe_CN.set_adapters(["unet_ourhood", "text_ourhood", "text_2_ourhood"], adapter_weights=[1.0, 1.0, 1.0])
pipe_CN.fuse_lora()

# The refiner shares the second text encoder and the VAE with the base pipeline.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe_CN.text_encoder_2,
    vae=pipe_CN.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
refiner.to("cuda")

# SECURITY NOTE: torch.load unpickles the file; only load trusted local files.
ip_pool = torch.load("./embeds_cases_for_ip.pt")
pool = list(ip_pool.values())

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

# SECURITY NOTE: torch.load unpickles the file; only load trusted local files.
slingshot = torch.load("./slingshot.pt")

# Conditioning assets for each selectable perspective ("scaffold").
_ASSET_BASE_URL = "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/"
_SCAFFOLD_ASSETS = {
    1: {
        'mask1': _ASSET_BASE_URL + "mask_in_square_2.png",
        'mask2': _ASSET_BASE_URL + "mask_out_square_2.png",
        'depth_image': _ASSET_BASE_URL + "mask_depth_noroof_square.png",
        'canny_image': _ASSET_BASE_URL + "mask_depth_solo_square.png",
    },
    2: {
        'mask1': _ASSET_BASE_URL + "mask_in_C.png",
        'mask2': _ASSET_BASE_URL + "mask_out_C.png",
        'depth_image': _ASSET_BASE_URL + "depth_C.png",
        'canny_image': _ASSET_BASE_URL + "canny_C_solo.png",
    },
    3: {
        'mask1': _ASSET_BASE_URL + "mask_in_B.png",
        'mask2': _ASSET_BASE_URL + "mask_out_B.png",
        'depth_image': _ASSET_BASE_URL + "depth_B.png",
        'canny_image': _ASSET_BASE_URL + "canny_B_solo.png",
    },
}

_NEGATIVE_PROMPT = "deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark"
_BRAND_PHRASE = 'ourhood privacy booth'


def _encode_with_both_encoders(text, max_length):
    """Tokenize ``text`` once and run it through both SDXL text encoders.

    Returns ``(input_ids, embeds_1, embeds_2, pooled_2)`` where the embeds are
    the penultimate hidden states and ``pooled_2`` is output ``[0]`` of the
    second encoder.
    """
    # NOTE(review): the ids from `pipe_CN.tokenizer` are also fed to
    # `text_encoder_2`; `pipe_CN.tokenizer_2` is never used. Kept as in the
    # original — confirm this is intended.
    input_ids = pipe_CN.tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids

    # No autograd graph is needed for inference.
    with torch.no_grad():
        out_1 = pipe_CN.text_encoder(input_ids.to('cuda'), output_hidden_states=True)
        out_2 = pipe_CN.text_encoder_2(input_ids.to('cuda'), output_hidden_states=True)

    return input_ids, out_1.hidden_states[-2], out_2.hidden_states[-2], out_2[0]


def _phrase_embedding_positions(phrase, text, input_ids):
    """Return the sequence positions in ``input_ids`` covered by ``phrase``.

    ``find_token_sequence_in_pre_tokenized`` reports indexes into the token
    list *without* special tokens, while ``input_ids`` (and the hidden states
    derived from it) start with a BOS token when the tokenizer adds one. The
    offset below realigns the two; the original code omitted it and shifted
    the embedding one position too early.
    """
    bos_id = pipe_CN.tokenizer.bos_token_id
    bos_offset = 1 if bos_id is not None and int(input_ids[0, 0]) == bos_id else 0
    seq_len = input_ids.shape[1]

    positions = []
    for tokens, start in find_token_sequence_in_pre_tokenized(phrase, text, pipe_CN):
        for k in range(len(tokens)):
            position = start + k + bos_offset
            if position < seq_len:  # ignore tokens lost to truncation
                positions.append(position)
    return positions


# @spaces.GPU  # [uncomment to use ZeroGPU]
def ourhood_inference(
    prompt: str = "",
    num_inference_steps: int = 50,
    scaffold: int = 1,
    seed: int = 0,
    cases_strength: float = 0.5,
    cases_scope: int = 1,
):
    """Generate an image of the OurHood booth in the setting given by ``prompt``.

    Args:
        prompt: Description of the setting; appended to a fixed product prompt.
        num_inference_steps: Step count passed to both the base pipeline and
            the refiner.
        scaffold: Key into the conditioning assets (1, 2 or 3).
        seed: Seed for the CUDA random generator.
        cases_strength: IP-Adapter scale.
        cases_scope: Number of reference embeddings averaged into the
            IP-Adapter image embedding.

    Returns:
        The first image produced by the refiner pipeline.

    Raises:
        KeyError: If ``scaffold`` is not a known key.
        ValueError: If no reference embedding could be selected.
    """
    # Gradio sliders may hand over floats; normalise once at the boundary.
    num_inference_steps = int(num_inference_steps)
    scaffold = int(scaffold)
    seed = int(seed)
    cases_strength = float(cases_strength)
    cases_scope = int(cases_scope)

    # Which embeddings receive the slingshot offset: 'both', 'pooled' or 'embeds'.
    condition = 'both'

    ### IP-Adapter embeds: average of the reference cases closest to the prompt
    with torch.no_grad():
        guide = pipe_CN.encode_prompt(prompt)
    closest, indexes = ip_guide(guide[2], pool, cases_scope)
    if not closest:
        raise ValueError("no reference embeddings selected; check cases_scope and the pool")

    ip_means = torch.mean(torch.stack(closest), dim=0)
    case_names = list(ip_pool.keys())
    print([case_names[i] for i in indexes])

    # Zero embedding first, averaged embedding second.
    ip_embeds = torch.cat(
        [torch.unsqueeze(torch.zeros_like(ip_means), 0), torch.unsqueeze(ip_means, 0)],
        0,
    ).to(dtype=torch.float16, device='cuda')

    pipe_CN.set_ip_adapter_scale([[cases_strength]])

    prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + prompt

    ### prompt encoding
    text_input_ids, prompt_embeds_1, prompt_embeds_2, pooled_prompt_embeds_2 = _encode_with_both_encoders(
        prompt1, pipe_CN.tokenizer.model_max_length
    )

    ### slingshot offset on the brand-phrase token embeddings
    if condition in ('both', 'embeds'):
        offset = slingshot['b'].to('cuda')
        for position in _phrase_embedding_positions(_BRAND_PHRASE, prompt1, text_input_ids):
            prompt_embeds_2[0][position] = prompt_embeds_2[0][position] + offset

    # NOTE(review): for the 'both' and 'pooled' modes the original also
    # computed `pooled_prompt_embeds_2 + slingshot['b']`, but then passed the
    # unshifted `pooled_prompt_embeds_2` to the pipeline, so that shift never
    # took effect. The effective behaviour is preserved here; if the pooled
    # shift is wanted, pass the shifted tensor as `pooled_prompt_embeds` below.

    ### concatenation
    prompt_embeds = torch.cat([prompt_embeds_1, prompt_embeds_2], dim=-1)

    ### negative embeds, padded to the same sequence length as the prompt
    _, negative_prompt_embeds_1, negative_prompt_embeds_2, negative_pooled_prompt_embeds = (
        _encode_with_both_encoders(_NEGATIVE_PROMPT, prompt_embeds.shape[1])
    )
    negative_prompt_embeds = torch.cat([negative_prompt_embeds_1, negative_prompt_embeds_2], dim=-1)

    ### mask init
    assets = _SCAFFOLD_ASSETS[scaffold]
    output_height = 1024
    output_width = 1024

    mask1 = load_image(assets['mask1'])
    mask2 = load_image(assets['mask2'])

    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    ### precomputed depth / canny images
    depth_image = load_image(assets['depth_image'])
    canny_image = load_image(assets['canny_image'])
    masked_depth = make_inpaint_condition(depth_image, mask2)

    ### inference
    n_steps = num_inference_steps
    generator = torch.Generator(device="cuda").manual_seed(seed)

    # The base pipeline stops at 95% of the schedule and hands latents to the
    # refiner, which resumes from the same point.
    results = pipe_CN(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds_2,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        ip_adapter_image_embeds=[ip_embeds],
        generator=generator,
        num_inference_steps=n_steps,
        num_images_per_prompt=1,
        denoising_end=0.95,
        image=[depth_image, masked_depth, canny_image],
        output_type="latent",
        control_guidance_start=[0.0, 0.35, 0.35],
        control_guidance_end=[0.35, 0.95, 0.95],
        controlnet_conditioning_scale=[0.35, 0.95, 0.95],
        cross_attention_kwargs={"ip_adapter_masks": masks},
    ).images[0]

    image = refiner(
        prompt=prompt1,
        generator=generator,
        num_inference_steps=n_steps,
        denoising_start=0.95,
        image=results,
    ).images[0]

    return image


examples = [
    "in a British museum, pavillion, masonry, high-tables and chairs",
    "in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    " in a Nordic atrium environment",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # HB8-Ourhood inference test
        """)

        with gr.Row():
            prompt = gr.Text(
                label="Setting prompt",
                show_label=False,
                max_lines=1,
                placeholder="Where do you want to show the Ourhood pod?",
                container=False,
            )

            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):

            perspective = gr.Slider(
                label="perspective",
                minimum=1,
                maximum=3,
                step=1,
                value=1,
            )

            seed = gr.Slider(
                label="Tracking number (seed)",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            cases_strength = gr.Slider(
                label="Brand strenght",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.5,
            )

            cases_scope = gr.Slider(
                label="Brand scope",
                minimum=1,
                maximum=10,
                step=1,
                value=1,
            )

            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Detail steps",
                    minimum=35,
                    maximum=75,
                    step=1,
                    value=50,  # Replace with defaults that work for your model
                )

        gr.Examples(
            examples=examples,
            inputs=[prompt],
        )

    # Every parameter of `ourhood_inference` is wired to a component, in
    # signature order. The original list stopped after `seed`, so the two
    # "Brand" sliders were never passed to the function.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=ourhood_inference,
        inputs=[prompt, num_inference_steps, perspective, seed, cases_strength, cases_scope],
        outputs=[result],
    )

demo.queue().launch()