Spaces:

Tonioesparza
/

hb8_ourhood_pilot

Sleeping

App Files Files Community

hb8_ourhood_pilot / app.py

Tonioesparza

Update app.py

c70c5d9 verified 2 months ago

raw

history blame contribute delete

14.9 kB

	import gradio as gr
	import numpy as np
	import random
	# import spaces #[uncomment to use ZeroGPU]
	import os
	from PIL import Image, ImageDraw, ImageFont
	import torch
	from PIL import Image
	from diffusers.utils import load_image
	from diffusers import DPMSolverSDEScheduler
	from diffusers import StableDiffusionXLImg2ImgPipeline, DPMSolverMultistepScheduler, AutoencoderTiny, \
	StableDiffusionXLControlNetPipeline, ControlNetModel
	from diffusers.utils import load_image
	from diffusers.image_processor import IPAdapterMaskProcessor
	from torch import nn


	### auxiliary functions

	def ip_guide(guide, pool,num=3):
	distances = []
	cos = nn.CosineSimilarity(dim=1, eps=1e-6)
	for embed in pool:
	dist = cos(guide, embed.to('cuda'))
	distances.append(dist)
	### find the indexes of the top 5 embeddings
	indexed_distances = list(enumerate(distances))
	# Sort the list of pairs based on the scores
	sorted_distances = sorted(indexed_distances, key=lambda x: x[1])
	# Extract the indexes of the lowest scores
	lowest_indexes = [index for index, score in sorted_distances[:num]]

	### return the embeddings with lowest_indexes
	return [pool[i] for i in lowest_indexes], lowest_indexes


	def make_inpaint_condition(image, image_mask):
	image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
	image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

	assert image.shape[0:1] == image_mask.shape[0:1]
	image[image_mask > 0.5] = -1.0 # set as masked pixel
	image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
	image = torch.from_numpy(image)
	return image

	def find_token_sequence_in_pre_tokenized(input_string, other_string,pipe):
	# Load the tokenizer
	tokenizer = pipe.tokenizer

	# Tokenize the input string
	input_tokens = tokenizer.tokenize(input_string)

	# Tokenize the other string

	pre_tokenized_tokens = tokenizer.tokenize(other_string)
	# Find matching token sequences and their indexes
	matching_sequences = []
	input_length = len(input_tokens)
	for i in range(len(pre_tokenized_tokens) - input_length + 1):
	if pre_tokenized_tokens[i:i + input_length] == input_tokens:
	matching_sequences.append((pre_tokenized_tokens[i:i + input_length], i))

	return matching_sequences


	device = "cuda" if torch.cuda.is_available() else "cpu"
	model_repo_id = "stabilityai/sdxl-turbo" # Replace to the model you would like to use

	if torch.cuda.is_available():
	torch_dtype = torch.float16
	else:
	torch_dtype = torch.float32

	processor_mask = IPAdapterMaskProcessor()
	controlnets = [
	ControlNetModel.from_pretrained(
	"diffusers/controlnet-depth-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16
	),
	ControlNetModel.from_pretrained(
	"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
	),
	]

	###load pipelines

	pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained("SG161222/RealVisXL_V5.0", torch_dtype=torch.float16,
	controlnet=[controlnets[0], controlnets[0],
	controlnets[1]], use_safetensors=True,
	variant='fp16')
	###pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
	pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained("SG161222/RealVisXL_V5.0", subfolder="scheduler",
	use_karras_sigmas=True)
	pipe_CN.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
	pipe_CN.to("cuda")

	##############################load loras
	pipe_CN.load_lora_weights('Tonioesparza/ourhood_training_dreambooth_lora_2_0',
	weight_name='pytorch_lora_weights.safetensors')
	#state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict('Tonioesparza/ourhood_training_dreambooth_lora_2_0', weight_name='pytorch_lora_weights.safetensors')
	#pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name='unet_ourhood')
	#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name='text_ourhood')
	#pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, prefix='2', adapter_name='text_2_ourhood')
	#pipe_CN.set_adapters(["unet_ourhood", "text_ourhood", "text_2_ourhood"], adapter_weights=[1.0, 1.0, 1.0])

	pipe_CN.fuse_lora()

	refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-1.0",
	text_encoder_2=pipe_CN.text_encoder_2, vae=pipe_CN.vae,
	torch_dtype=torch.float16, use_safetensors=True,
	variant="fp16")
	refiner.to("cuda")

	ip_pool = torch.load("./embeds_cases_for_ip.pt")

	pool = list(ip_pool.values())

	MAX_SEED = np.iinfo(np.int32).max
	MAX_IMAGE_SIZE = 1024

	slingshot = torch.load("./slingshot.pt")

	def ourhood_inference(prompt=str, num_inference_steps=int, scaffold=int, seed=int, cases_strength=float , cases_scope=int ):
	###pro_encode = pipe_cn.encode_text(prompt) ###ip_images init

	condition = 'both'

	guide = pipe_CN.encode_prompt(prompt)

	closest, indexes = ip_guide(guide[2], pool,cases_scope)

	### torch.mean de los indexes

	ip_means = torch.mean(torch.stack([pool[i] for i in indexes]), dim=0)

	print([list(ip_pool.keys())[i] for i in indexes])

	ip_embeds = torch.cat([torch.unsqueeze(torch.zeros_like(closest[0]), 0), torch.unsqueeze(ip_means, 0)], 0).to(
	dtype=torch.float16, device='cuda')

	pipe_CN.set_ip_adapter_scale([[cases_strength]])

	prompt1 = 'A photograph, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, in ' + prompt

	### prompt encoding

	text_inputs = pipe_CN.tokenizer(
	prompt1,
	padding="max_length",
	max_length=pipe_CN.tokenizer.model_max_length,
	truncation=True,
	return_tensors="pt",
	)

	text_input_ids = text_inputs.input_ids

	prompt_embeds_1 = pipe_CN.text_encoder(text_input_ids.to('cuda'), output_hidden_states=True)

	prompt_embeds_1 = prompt_embeds_1.hidden_states[-2]

	###embed prompt encoding 2

	prompt_embeds_2 = pipe_CN.text_encoder_2(text_input_ids.to('cuda'), output_hidden_states=True)

	pooled_prompt_embeds_2 = prompt_embeds_2[0]

	prompt_embeds_2 = prompt_embeds_2.hidden_states[-2]

	#### substraction

	if condition == 'both':

	matches = find_token_sequence_in_pre_tokenized('ourhood privacy booth', prompt1, pipe_CN)

	items = []

	for match in matches:
	for w in range(len(match[0])):
	items.append(match[1] + w)

	for it in items:
	prompt_embeds_2[0][it] = prompt_embeds_2[0][it] + slingshot['b'].to('cuda')

	pooled_prompt_embeds = pooled_prompt_embeds_2 + slingshot['b'].to('cuda')

	elif condition == 'pooled':

	pooled_prompt_embeds = pooled_prompt_embeds_2 + slingshot['b'].to('cuda')

	elif condition == 'embeds':

	matches = find_token_sequence_in_pre_tokenized('ourhood privacy booth', prompt1, pipe_CN)

	items = []

	for match in matches:
	for w in range(len(match[0])):
	items.append(match[1] + w)

	for it in items:
	prompt_embeds_2[0][it] = prompt_embeds_2[0][it] + slingshot['b'].to('cuda')

	### concatenation

	prompt_embeds = torch.cat([prompt_embeds_1, prompt_embeds_2], dim=-1)

	### create negative embeds text encoder 1

	negative_prompt = "deformed, ugly, wrong proportion, low res, worst quality, low quality,text,watermark"

	max_length = prompt_embeds.shape[1]

	uncond_input = pipe_CN.tokenizer(
	negative_prompt,
	padding="max_length",
	max_length=max_length,
	truncation=True,
	return_tensors="pt",
	)

	uncond_input_ids = uncond_input.input_ids

	negative_prompt_embeds_1 = pipe_CN.text_encoder(
	uncond_input_ids.to('cuda'),
	output_hidden_states=True,
	)

	negative_prompt_embeds_1 = negative_prompt_embeds_1.hidden_states[-2]

	### create negative embeds text encoder 2

	negative_prompt_embeds_2 = pipe_CN.text_encoder_2(
	uncond_input_ids.to('cuda'),
	output_hidden_states=True,
	)

	negative_pooled_prompt_embeds = negative_prompt_embeds_2[0]

	negative_prompt_embeds_2 = negative_prompt_embeds_2.hidden_states[-2]

	### negative concatenation

	negative_prompt_embeds = torch.cat([negative_prompt_embeds_1, negative_prompt_embeds_2], dim=-1)

	### function has no formats defined

	scaff_dic = {1: {
	'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_square_2.png",
	'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_square_2.png",
	'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_noroof_square.png",
	'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_solo_square.png"},
	2: {
	'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_C.png",
	'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_C.png",
	'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_C.png",
	'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_C_solo.png"},
	3: {
	'mask1': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_B.png",
	'mask2': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_out_B.png",
	'depth_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_B.png",
	'canny_image': "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_B_solo.png"}}
	### mask init

	output_height = 1024
	output_width = 1024

	mask1 = load_image(scaff_dic[scaffold]['mask1'])
	mask2 = load_image(scaff_dic[scaffold]['mask2'])

	masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
	masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

	###precomputed depth image

	depth_image = load_image(scaff_dic[scaffold]['depth_image'])
	canny_image = load_image(scaff_dic[scaffold]['canny_image'])
	masked_depth = make_inpaint_condition(depth_image, mask2)

	images_CN = [depth_image, canny_image]

	### inference

	n_steps = num_inference_steps

	generator = torch.Generator(device="cuda").manual_seed(seed)

	results = pipe_CN(
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	pooled_prompt_embeds=pooled_prompt_embeds_2,
	negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
	ip_adapter_image_embeds=[ip_embeds],
	generator=generator,
	num_inference_steps=n_steps,
	num_images_per_prompt=1,
	denoising_end=0.95,
	image=[depth_image, masked_depth, canny_image],
	output_type="latent",
	control_guidance_start=[0.0, 0.35, 0.35],
	control_guidance_end=[0.35, 0.95, 0.95],
	controlnet_conditioning_scale=[0.35, 0.95, 0.95],
	cross_attention_kwargs={"ip_adapter_masks": masks}
	).images[0]

	image = refiner(
	prompt=prompt1,
	generator=generator,
	num_inference_steps=n_steps,
	denoising_start=0.95,
	image=results,
	).images[0]

	return image



	#@spaces.GPU #[uncomment to use ZeroGPU]

	examples = [
	"in a British museum, pavillion, masonry, high-tables and chairs",
	"in a high ceilinged atrium, glass front, plantwalls, concrete floor, furniture, golden hour",
	"in a colorful open office environment",
	" in a Nordic atrium environment"]

	css="""
	#col-container {
	margin: 0 auto;
	max-width: 640px;
	}
	"""

	with gr.Blocks(css=css) as demo:

	with gr.Column(elem_id="col-container"):
	gr.Markdown(f"""
	# HB8-Ourhood inference test
	""")

	with gr.Row():

	prompt = gr.Text(
	label="Setting prompt",
	show_label=False,
	max_lines=1,
	placeholder="Where do you want to show the Ourhood pod?",
	container=False,
	)

	run_button = gr.Button("Run", scale=0)

	result = gr.Image(label="Result", show_label=False)

	with gr.Accordion("Advanced Settings", open=False):

	perspective = gr.Slider(
	label="perspective",
	minimum=1,
	maximum=3,
	step=1,
	value=1,
	)

	seed = gr.Slider(
	label="Tracking number (seed)",
	minimum=0,
	maximum=MAX_SEED,
	step=1,
	value=0,
	)


	cases_strength = gr.Slider(
	label="Brand strenght",
	minimum=0.0,
	maximum=1.0,
	step=0.05,
	value=0.5,
	)

	cases_scope = gr.Slider(
	label="Brand scope",
	minimum=1,
	maximum=10,
	step=1,
	value=1,
	)


	with gr.Row():


	num_inference_steps = gr.Slider(
	label="Detail steps",
	minimum=35,
	maximum=75,
	step=1,
	value=50, #Replace with defaults that work for your model
	)

	gr.Examples(
	examples = examples,
	inputs = [prompt]
	)
	gr.on(
	triggers=[run_button.click, prompt.submit],
	fn = ourhood_inference,
	inputs = [prompt, num_inference_steps, perspective, seed,cases_strength,cases_scope],
	outputs = [result]
	)

	demo.queue().launch()