# Captured from a Hugging Face Space page (status at capture time: Sleeping); this header is not part of the program.
import gradio as gr | |
import numpy as np | |
import random | |
#import spaces #[uncomment to use ZeroGPU] | |
import os | |
from PIL import Image, ImageDraw, ImageFont | |
import torch | |
from PIL import Image | |
from diffusers.utils import load_image | |
from diffusers import DPMSolverSDEScheduler | |
from diffusers import StableDiffusionXLImg2ImgPipeline, DPMSolverMultistepScheduler, AutoencoderTiny, StableDiffusionXLControlNetPipeline, ControlNetModel | |
from diffusers.utils import load_image | |
from diffusers.image_processor import IPAdapterMaskProcessor | |
from torch import nn | |
### auxiliary functions | |
def ip_guide(guide, pool, top_k=5):
    """Return the pool embeddings most similar to ``guide``.

    Each entry of ``pool`` is scored against ``guide`` with cosine similarity
    along ``dim=1``. Cosine similarity grows as two vectors get closer, so
    the best matches are the entries with the HIGHEST score. (Earlier
    revisions sorted ascending and therefore returned the least similar
    entries.)

    Args:
        guide: Query embedding tensor, compared along ``dim=1``
            (assumes a ``(1, dim)`` layout - TODO confirm with the caller).
        pool: Sequence of embedding tensors to search. Entries are moved to
            the device/dtype of ``guide`` only for scoring; the returned
            objects are the untouched originals.
        top_k: Maximum number of matches to return.

    Returns:
        ``(embeddings, indexes)``: the selected pool entries and their
        positions in ``pool``, best match first. Both lists are empty when
        ``pool`` is empty.
    """
    if not pool:
        return [], []

    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    # Score in float32 so half-precision inputs also work on CPU.
    query = guide.float()
    scores = [
        # .mean() collapses the per-row result to one scalar so entries can
        # be ranked with plain Python comparisons.
        cos(query, embed.to(device=query.device, dtype=query.dtype)).mean().item()
        for embed in pool
    ]

    ranked = sorted(range(len(pool)), key=scores.__getitem__, reverse=True)
    best_indexes = ranked[:top_k]
    return [pool[i] for i in best_indexes], best_indexes
def make_inpaint_condition(image, image_mask):
    """Build a control tensor in which masked pixels are flagged with -1.

    Args:
        image: Object with a PIL-style ``convert("RGB")`` method.
        image_mask: Object with a PIL-style ``convert("L")`` method; pixels
            brighter than 50% are treated as masked.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, 3, height, width)`` with
        values scaled to ``[0, 1]`` and masked pixels set to ``-1.0``.

    Raises:
        ValueError: If the image and the mask differ in height or width.
    """
    pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

    # Compare BOTH spatial dimensions; checking only the height would let a
    # width mismatch through to a confusing indexing error below.
    if pixels.shape[:2] != mask.shape[:2]:
        raise ValueError(
            "image and image_mask must have the same size, got "
            f"{pixels.shape[:2]} and {mask.shape[:2]}"
        )

    pixels[mask > 0.5] = -1.0  # set as masked pixel
    # HWC -> NCHW with a leading batch dimension of 1.
    batch = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(batch)
# Run on the GPU in half precision when CUDA is available, otherwise on the
# CPU in float32. Everything below uses these two values instead of
# hard-coding "cuda" / float16.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/sdxl-turbo" #Replace to the model you would like to use
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

processor_mask = IPAdapterMaskProcessor()

# One depth and one canny ControlNet; the depth model is reused for two of
# the three control slots of the pipeline below.
controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch_dtype
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch_dtype, use_safetensors=True, variant="fp16"
    ),
]

###load pipelines
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained(
    "SG161222/RealVisXL_V5.0",
    torch_dtype=torch_dtype,
    # Slot order must match the `image=[...]` list passed at inference time:
    # depth, depth, canny.
    controlnet=[controlnets[0], controlnets[0], controlnets[1]],
    use_safetensors=True,
    variant='fp16',
)
pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch_dtype)
pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained(
    "SG161222/RealVisXL_V5.0", subfolder="scheduler", use_karras_sigmas=True
)
pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipe_CN.to(device)

##############################load loras
state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict(
    'Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors'
)
pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
# NOTE(review): this call targets `pipe_CN.text_encoder` again, with
# prefix='2'. If the intent is the second SDXL text encoder it presumably
# should pass `pipe_CN.text_encoder_2` and that encoder's key prefix -
# confirm against the diffusers LoRA loader documentation before changing.
pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, prefix='2', adapter_name='text_2_ourhood')
pipe_CN.set_adapters(["unet_ourhood", "text_ourhood", "text_2_ourhood"], adapter_weights=[1.0, 1.0, 1.0])
pipe_CN.fuse_lora()

# The refiner shares the second text encoder and the VAE with the base
# pipeline instead of loading its own copies.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe_CN.text_encoder_2,
    vae=pipe_CN.vae,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    variant="fp16",
)
refiner.to(device)

# SECURITY: torch.load unpickles the file - only ever load a trusted,
# locally shipped embeddings file here. map_location keeps the load working
# on machines without the device the tensors were saved from.
ip_pool = torch.load("./embeds_cases_for_ip.pt", map_location=device)
pool = list(ip_pool.values())

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024
# Hosted control images (masks, depth and canny maps) for each selectable
# perspective. Built once at import time instead of on every request.
_SCAFFOLD_BASE_URL = "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/"
_SCAFFOLD_FILES = {
    1: {
        'mask1': "mask_in_square_2.png",
        'mask2': "mask_out_square_2.png",
        'depth_image': "mask_depth_noroof_square.png",
        'canny_image': "mask_depth_solo_square.png",
    },
    2: {
        'mask1': "mask_in_C.png",
        'mask2': "mask_out_C.png",
        'depth_image': "depth_C.png",
        'canny_image': "canny_C_solo.png",
    },
    3: {
        'mask1': "mask_in_B.png",
        'mask2': "mask_out_B.png",
        'depth_image': "depth_B.png",
        'canny_image': "canny_B_solo.png",
    },
}


def ourhood_inference(prompt: str = "", num_inference_steps: int = 50, scaffold: int = 1, seed: int = 0):
    """Render an OurHood booth in the setting described by ``prompt``.

    The base ControlNet pipeline (``pipe_CN``) denoises up to the 0.95 mark
    and returns latents; ``refiner`` finishes the remaining steps.

    Args:
        prompt: Free-text description of the surroundings. A leading "in "
            is dropped because the prompt template already ends with "in ".
        num_inference_steps: Number of denoising steps for both stages.
        scaffold: Perspective preset, a key of ``_SCAFFOLD_FILES`` (1-3).
        seed: Seed for the random generator shared by both stages.

    Returns:
        The first image produced by the refiner.

    Raises:
        KeyError: If ``scaffold`` is not a known preset.
    """
    # Gradio sliders may deliver floats; the scheduler, the preset lookup
    # and torch.Generator.manual_seed all need real ints.
    num_inference_steps = int(num_inference_steps)
    scaffold = int(scaffold)
    seed = int(seed)

    # Follow whatever device/dtype the pipeline was actually placed on.
    device = pipe_CN.device
    dtype = pipe_CN.dtype

    ### pick the stored IP-adapter embedding that best matches the prompt
    guide = pipe_CN.encode_prompt(prompt)
    # Index 2 of encode_prompt's result is the pooled prompt embedding
    # (per the diffusers SDXL pipeline documentation).
    closest, indexes = ip_guide(guide[2], pool)
    pool_keys = list(ip_pool.keys())
    print([pool_keys[i] for i in indexes])
    best = closest[0]
    # Pair the chosen embedding with an all-zero one of the same shape.
    ip_embeds = torch.cat(
        [torch.unsqueeze(torch.zeros_like(best), 0), torch.unsqueeze(best, 0)], 0
    ).to(dtype=dtype, device=device)
    pipe_CN.set_ip_adapter_scale([[0.5]])

    # The template already ends with "in ", so strip a leading "in " from
    # the user text to avoid generating "..., in in a museum".
    setting = prompt.strip()
    if setting.lower().startswith("in "):
        setting = setting[3:].lstrip()
    prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + setting

    files = _SCAFFOLD_FILES[scaffold]

    ### mask init
    output_height = 1024
    output_width = 1024
    mask1 = load_image(_SCAFFOLD_BASE_URL + files['mask1'])
    mask2 = load_image(_SCAFFOLD_BASE_URL + files['mask2'])
    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    ### precomputed control images
    depth_image = load_image(_SCAFFOLD_BASE_URL + files['depth_image'])
    canny_image = load_image(_SCAFFOLD_BASE_URL + files['canny_image'])
    masked_depth = make_inpaint_condition(depth_image, mask2)

    ### inference
    generator = torch.Generator(device=device).manual_seed(seed)
    latents = pipe_CN(
        prompt=prompt1,
        ip_adapter_image_embeds=[ip_embeds],
        negative_prompt="deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark",
        generator=generator,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        denoising_end=0.95,
        # One control image per ControlNet slot of pipe_CN.
        image=[depth_image, masked_depth, canny_image],
        output_type="latent",
        control_guidance_start=[0.0, 0.35, 0.35],
        control_guidance_end=[0.35, 0.95, 0.95],
        controlnet_conditioning_scale=[0.35, 0.95, 0.95],
        cross_attention_kwargs={"ip_adapter_masks": masks},
    ).images[0]

    image = refiner(
        prompt=prompt1,
        generator=generator,
        num_inference_steps=num_inference_steps,
        denoising_start=0.95,
        image=latents,
    ).images[0]
    return image
# To use ZeroGPU: uncomment "import spaces" at the top of the file and put "@spaces.GPU" directly above "def ourhood_inference".
# Sample settings offered below the prompt box; each one is fed to the
# prompt input exactly as written.
examples = [
    "in a British museum, pavillion, masonry, high-tables and chairs",
    "in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    " in a Nordic atrium environment",
]

# Centre the main column and cap its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""
# Gradio front end: one prompt box plus a Run button, the result image, and
# an "Advanced Settings" accordion holding the perspective, seed and step
# sliders.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # HB8-Ourhood inference test
        """)

        with gr.Row():
            prompt = gr.Text(
                label="Setting prompt",
                show_label=False,
                max_lines=1,
                placeholder="Where do you want to show the Ourhood pod?",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            # Passed to ourhood_inference as its `scaffold` argument.
            perspective = gr.Slider(label="perspective", minimum=1, maximum=3, step=1, value=1)
            seed = gr.Slider(
                label="Tracking number (seed)",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Detail steps",
                    minimum=35,
                    maximum=75,
                    step=1,
                    value=50,  # Replace with defaults that work for your model
                )

        gr.Examples(examples=examples, inputs=[prompt])

    # Clicking Run and pressing Enter in the prompt box trigger the same
    # call; the input order matches ourhood_inference's positional
    # parameters (prompt, num_inference_steps, scaffold, seed).
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=ourhood_inference,
        inputs=[prompt, num_inference_steps, perspective, seed],
        outputs=[result],
    )

demo.queue().launch()