Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
import random | |
# import spaces #[uncomment to use ZeroGPU] | |
import os | |
from PIL import Image, ImageDraw, ImageFont | |
import torch | |
from PIL import Image | |
from diffusers.utils import load_image | |
from diffusers import DPMSolverSDEScheduler | |
from diffusers import StableDiffusionXLImg2ImgPipeline, DPMSolverMultistepScheduler, AutoencoderTiny, \ | |
StableDiffusionXLControlNetPipeline, ControlNetModel | |
from diffusers.utils import load_image | |
from diffusers.image_processor import IPAdapterMaskProcessor | |
from torch import nn | |
### auxiliary functions | |
def ip_guide(guide, pool, num=3):
    """Pick the ``num`` embeddings in ``pool`` that best match ``guide``.

    Candidates are scored with cosine similarity along ``dim=1``; a *higher*
    score means a closer match, so the ranking is taken in descending order.

    Args:
        guide: Reference embedding tensor. Every pool entry is moved to the
            device of this tensor before it is compared.
        pool: Sequence of candidate embedding tensors.
        num: Number of best matches to return.

    Returns:
        A ``(embeddings, indexes)`` tuple: the selected entries of ``pool``
        (best match first) and their positions inside ``pool``.
    """
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    # float() needs exactly one score per candidate, i.e. a single-row guide.
    scores = [float(cos(guide, embed.to(guide.device))) for embed in pool]
    # sorted() is stable, so equally scored candidates keep their pool order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    # int() guards the slice against a float count (e.g. 3.0).
    best_indexes = ranked[:int(num)]
    return [pool[i] for i in best_indexes], best_indexes
def make_inpaint_condition(image, image_mask):
    """Build a conditioning tensor in which masked pixels are set to -1.

    Args:
        image: Object exposing ``convert("RGB")`` (used as a PIL image).
        image_mask: Object exposing ``convert("L")``; values above 50 % of the
            8-bit range mark the pixels to mask out.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, 3, height, width)`` with
        values scaled to ``[0, 1]`` and masked pixels replaced by ``-1.0``.

    Raises:
        ValueError: If image and mask differ in height or width.
    """
    pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
    # Compare height AND width; checking only the first axis lets a width
    # mismatch through until the boolean indexing below fails.
    if pixels.shape[:2] != mask.shape[:2]:
        raise ValueError(
            "image and image_mask must have the same size, got "
            f"{pixels.shape[:2]} and {mask.shape[:2]}"
        )
    pixels[mask > 0.5] = -1.0  # set as masked pixel
    # HWC -> 1CHW
    pixels = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(pixels)
def find_token_sequence_in_pre_tokenized(input_string, other_string, pipe):
    """Locate every occurrence of one tokenized string inside another.

    Both strings are tokenized with ``pipe.tokenizer.tokenize``.

    Args:
        input_string: Text whose token sequence is searched for.
        other_string: Text whose token sequence is searched in.
        pipe: Object exposing a ``tokenizer`` with a ``tokenize`` method.

    Returns:
        A list of ``(tokens, start)`` tuples, one per match, where ``tokens``
        is the matched slice and ``start`` its index in the token list of
        ``other_string``.
    """
    tok = pipe.tokenizer
    needle = tok.tokenize(input_string)
    haystack = tok.tokenize(other_string)
    width = len(needle)
    last_start = len(haystack) - width
    return [
        (haystack[start:start + width], start)
        for start in range(last_start + 1)
        if haystack[start:start + width] == needle
    ]
# Prefer the GPU when one is present; half precision is only selected with it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): nothing in the visible code reads model_repo_id; kept in case
# it is referenced elsewhere.
model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use

if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

processor_mask = IPAdapterMaskProcessor()

# Index 0 is the depth ControlNet, index 1 the canny ControlNet.
controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch_dtype
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch_dtype, use_safetensors=True, variant="fp16"
    ),
]

### load pipelines
# The depth ControlNet is registered twice because the pipeline is later fed
# three conditioning images: depth, masked depth and canny.
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained(
    "SG161222/RealVisXL_V5.0",
    torch_dtype=torch_dtype,
    controlnet=[controlnets[0], controlnets[0], controlnets[1]],
    use_safetensors=True,
    variant='fp16',
)
###pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained(
    "SG161222/RealVisXL_V5.0", subfolder="scheduler", use_karras_sigmas=True
)
pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
# Use the device selected above instead of a hard-coded "cuda".
pipe_CN.to(device)

############################## load loras
pipe_CN.load_lora_weights(
    'Tonioesparza/ourhood_training_dreambooth_lora_2_0',
    weight_name='pytorch_lora_weights.safetensors',
)
#state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict('Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors')
#pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, prefix='2', adapter_name='text_2_ourhood')
#pipe_CN.set_adapters(["unet_ourhood", "text_ourhood", "text_2_ourhood"], adapter_weights=[1.0, 1.0, 1.0])
pipe_CN.fuse_lora()

# The refiner shares the second text encoder and the VAE with pipe_CN.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe_CN.text_encoder_2,
    vae=pipe_CN.vae,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    variant="fp16",
)
refiner.to(device)

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only ship .pt files from a trusted source alongside this script.
ip_pool = torch.load("./embeds_cases_for_ip.pt")
pool = list(ip_pool.values())

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

# SECURITY NOTE: same pickle caveat as for the embedding pool above.
slingshot = torch.load("./slingshot.pt")
# Conditioning assets per selectable perspective ("scaffold"): two masks plus
# a depth and a canny control image.
_SCAFFOLDS = {
    1: {
        'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_square_2.png",
        'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_square_2.png",
        'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_noroof_square.png",
        'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_solo_square.png",
    },
    2: {
        'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_C.png",
        'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_C.png",
        'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_C.png",
        'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_C_solo.png",
    },
    3: {
        'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_B.png",
        'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_B.png",
        'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_B.png",
        'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_B_solo.png",
    },
}


@torch.no_grad()
def _encode_with_both_encoders(text, max_length):
    """Tokenize ``text`` once and run it through both text encoders of pipe_CN.

    Returns ``(hidden_1, hidden_2, pooled_2)``: the penultimate hidden states
    of each encoder and element 0 of the second encoder's output.
    """
    # NOTE(review): ids come from pipe_CN.tokenizer and are fed to both
    # encoders; confirm that bypassing a second tokenizer is intended.
    input_ids = pipe_CN.tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to(device)
    hidden_1 = pipe_CN.text_encoder(input_ids, output_hidden_states=True).hidden_states[-2]
    out_2 = pipe_CN.text_encoder_2(input_ids, output_hidden_states=True)
    return hidden_1, out_2.hidden_states[-2], out_2[0]


def ourhood_inference(
    prompt: str = "",
    num_inference_steps: int = 50,
    scaffold: int = 1,
    seed: int = 0,
    cases_strength: float = 0.5,
    cases_scope: int = 1,
):
    """Render an OurHood privacy booth in the setting described by ``prompt``.

    Args:
        prompt: Description of the surroundings; appended to a fixed booth
            description.
        num_inference_steps: Step count passed to both the ControlNet pipeline
            and the refiner.
        scaffold: Key into the perspective table (1, 2 or 3) selecting the
            masks and control images.
        seed: Seed for the random generator shared by both pipelines.
        cases_strength: IP-Adapter scale.
        cases_scope: Number of pool embeddings averaged into the IP-Adapter
            image embedding.

    Returns:
        The first image produced by the refiner.

    Raises:
        KeyError: If ``scaffold`` is not a known perspective.
    """
    # Coerce defensively: these values are used as a dict key, an RNG seed,
    # a slice bound and a step count, all of which must be integers.
    num_inference_steps = int(num_inference_steps)
    scaffold = int(scaffold)
    seed = int(seed)
    cases_scope = int(cases_scope)
    assets = _SCAFFOLDS[scaffold]

    ### IP-Adapter embedding from the reference cases picked by ip_guide
    with torch.no_grad():
        guide = pipe_CN.encode_prompt(prompt)
    # guide[2] is presumably the pooled prompt embedding — confirm against
    # the encode_prompt return order.
    closest, indexes = ip_guide(guide[2], pool, cases_scope)
    ip_means = torch.mean(torch.stack([pool[i] for i in indexes]), dim=0)
    case_names = list(ip_pool)
    print([case_names[i] for i in indexes])
    # Row 0 is an all-zero embedding, row 1 the averaged case embedding.
    ip_embeds = torch.cat(
        [torch.zeros_like(closest[0]).unsqueeze(0), ip_means.unsqueeze(0)], 0
    ).to(dtype=torch_dtype, device=device)
    pipe_CN.set_ip_adapter_scale([[cases_strength]])

    prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + prompt

    ### prompt encoding
    prompt_embeds_1, prompt_embeds_2, pooled_prompt_embeds_2 = _encode_with_both_encoders(
        prompt1, pipe_CN.tokenizer.model_max_length
    )

    ### shift the embeddings of the booth tokens by the "slingshot" vector
    # NOTE(review): the positions come from tokenizer.tokenize(); if the
    # tokenizer call prepends a start-of-text token, they may sit one slot
    # before the intended tokens in prompt_embeds_2 — verify before changing.
    shift = slingshot['b'].to(device)
    for tokens, start in find_token_sequence_in_pre_tokenized('ourhood privacy booth', prompt1, pipe_CN):
        stop = start + len(tokens)
        prompt_embeds_2[0, start:stop] = prompt_embeds_2[0, start:stop] + shift
    # NOTE(review): the previous code also computed `pooled + slingshot['b']`
    # but still handed the unshifted pooled embedding to the pipeline, so the
    # shifted value never had any effect. That effective behaviour is kept;
    # decide deliberately whether the pooled embedding should be shifted too.

    ### concatenation
    prompt_embeds = torch.cat([prompt_embeds_1, prompt_embeds_2], dim=-1)

    ### negative embeds, padded to the same sequence length as the prompt
    negative_prompt = "deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark"
    negative_1, negative_2, negative_pooled_prompt_embeds = _encode_with_both_encoders(
        negative_prompt, prompt_embeds.shape[1]
    )
    negative_prompt_embeds = torch.cat([negative_1, negative_2], dim=-1)

    ### mask init
    output_height = 1024
    output_width = 1024
    mask1 = load_image(assets['mask1'])
    mask2 = load_image(assets['mask2'])
    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    ### precomputed control images
    depth_image = load_image(assets['depth_image'])
    canny_image = load_image(assets['canny_image'])
    masked_depth = make_inpaint_condition(depth_image, mask2)

    ### inference: the base pipeline stops at 95 % and returns latents,
    ### the refiner resumes from there
    generator = torch.Generator(device=device).manual_seed(seed)
    latents = pipe_CN(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds_2,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        ip_adapter_image_embeds=[ip_embeds],
        generator=generator,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        denoising_end=0.95,
        image=[depth_image, masked_depth, canny_image],
        output_type="latent",
        control_guidance_start=[0.0, 0.35, 0.35],
        control_guidance_end=[0.35, 0.95, 0.95],
        controlnet_conditioning_scale=[0.35, 0.95, 0.95],
        cross_attention_kwargs={"ip_adapter_masks": masks},
    ).images[0]
    image = refiner(
        prompt=prompt1,
        generator=generator,
        num_inference_steps=num_inference_steps,
        denoising_start=0.95,
        image=latents,
    ).images[0]
    return image
#@spaces.GPU #[uncomment to use ZeroGPU]
# Example setting prompts offered below the input box.
examples = [
    "in a British museum, pavillion, masonry, high-tables and chairs",
    "in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    " in a Nordic atrium environment",
]

css = """
#col-container {
margin: 0 auto;
max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("\n# HB8-Ourhood inference test\n")

        with gr.Row():
            prompt = gr.Text(
                label="Setting prompt",
                show_label=False,
                max_lines=1,
                placeholder="Where do you want to show the Ourhood pod?",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            perspective = gr.Slider(
                label="perspective",
                minimum=1,
                maximum=3,
                step=1,
                value=1,
            )
            seed = gr.Slider(
                label="Tracking number (seed)",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            cases_strength = gr.Slider(
                label="Brand strength",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.5,
            )
            cases_scope = gr.Slider(
                label="Brand scope",
                minimum=1,
                maximum=10,
                step=1,
                value=1,
            )
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Detail steps",
                    minimum=35,
                    maximum=75,
                    step=1,
                    value=50,  # Replace with defaults that work for your model
                )

        gr.Examples(
            examples=examples,
            inputs=[prompt],
        )

    # The input order must follow the positional parameters of
    # ourhood_inference; the two "Brand" sliders have to be forwarded as well,
    # otherwise the handler never sees their values.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=ourhood_inference,
        inputs=[prompt, num_inference_steps, perspective, seed, cases_strength, cases_scope],
        outputs=[result],
    )

demo.queue().launch()