# PowerPaint v2 (BrushNet) Gradio demo: text-guided inpainting and image outpainting.
import os
import random

import gradio as gr
import numpy as np
import torch
from PIL import Image, ImageFilter
from safetensors.torch import load_model
from transformers import CLIPTextModel

from diffusers import UniPCMultistepScheduler
from model.BrushNet_CA import BrushNetModel
from model.diffusers_c.models import UNet2DConditionModel
from pipeline.pipeline_PowerPaint_Brushnet_CA import StableDiffusionPowerPaintBrushNetPipeline
from utils.utils import TokenizerWrapper, add_tokens

# Fetch the PowerPaint v2 weights (requires git and git-lfs).
base_path = "./PowerPaint_v2"
os.system("apt install git")
os.system("apt install git-lfs")
os.system(f"git lfs clone https://code.openxlab.org.cn/zhuangjunhao/PowerPaint_v2.git {base_path}")
os.system(f"cd {base_path} && git lfs pull")

# Inference only: no gradients needed anywhere in this script.
torch.set_grad_enabled(False)

context_prompt = ""
context_negative_prompt = ""

base_model_path = "./PowerPaint_v2/realisticVisionV60B1_v51VAE/"
dtype = torch.float16

# BrushNet is initialized from the SD 1.5 UNet; its text encoder likewise
# starts from the SD 1.5 CLIP text encoder.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", revision=None, torch_dtype=dtype
)
text_encoder_brushnet = CLIPTextModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="text_encoder", revision=None, torch_dtype=dtype
)
brushnet = BrushNetModel.from_unet(unet)

pipe = StableDiffusionPowerPaintBrushNetPipeline.from_pretrained(
    base_model_path,
    brushnet=brushnet,
    text_encoder_brushnet=text_encoder_brushnet,
    torch_dtype=dtype,
    low_cpu_mem_usage=False,
    safety_checker=None,
)
# Swap in the RealisticVision UNet and a tokenizer wrapper that supports
# the learnable PowerPaint task tokens added below.
pipe.unet = UNet2DConditionModel.from_pretrained(
    base_model_path, subfolder="unet", revision=None, torch_dtype=dtype
)
pipe.tokenizer = TokenizerWrapper(from_pretrained=base_model_path, subfolder="tokenizer", revision=None)
add_tokens(
    tokenizer=pipe.tokenizer,
    text_encoder=pipe.text_encoder_brushnet,
    placeholder_tokens=["P_ctxt", "P_shape", "P_obj"],
    initialize_tokens=["a", "a", "a"],
    num_vectors_per_token=10,
)

# Load the fine-tuned BrushNet and text-encoder weights.
load_model(pipe.brushnet, "./PowerPaint_v2/PowerPaint_Brushnet/diffusion_pytorch_model.safetensors")
pipe.text_encoder_brushnet.load_state_dict(
    torch.load("./PowerPaint_v2/PowerPaint_Brushnet/pytorch_model.bin"), strict=False
)

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

current_control = "canny"
# controlnet_conditioning_scale = 0.8


def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


def add_task(control_type):
    # Map each task to its pair of learnable prompt tokens (and negatives).
    if control_type == "object-removal":
        promptA = "P_ctxt"
        promptB = "P_ctxt"
        negative_promptA = "P_obj"
        negative_promptB = "P_obj"
    elif control_type == "context-aware":
        promptA = "P_ctxt"
        promptB = "P_ctxt"
        negative_promptA = ""
        negative_promptB = ""
    elif control_type == "shape-guided":
        promptA = "P_shape"
        promptB = "P_ctxt"
        negative_promptA = "P_shape"
        negative_promptB = "P_ctxt"
    elif control_type == "image-outpainting":
        promptA = "P_ctxt"
        promptB = "P_ctxt"
        negative_promptA = "P_obj"
        negative_promptB = "P_obj"
    else:
        promptA = "P_obj"
        promptB = "P_obj"
        negative_promptA = "P_obj"
        negative_promptB = "P_obj"
    return promptA, promptB, negative_promptA, negative_promptB
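
# For example, for image-outpainting the context token P_ctxt drives both
# prompts while P_obj is the negative on both branches:
#   promptA, promptB, negA, negB = add_task("image-outpainting")
#   # -> ("P_ctxt", "P_ctxt", "P_obj", "P_obj")
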
def predict(
    input_image,
    prompt,
    fitting_degree,
    ddim_steps,
    scale,
    seed,
    negative_prompt,
    task,
    left_expansion_ratio,
    right_expansion_ratio,
    top_expansion_ratio,
    bottom_expansion_ratio,
):
    # Resize so the short side is 640 px (512 px for outpainting, which is
    # padded further below), keeping the aspect ratio.
    size1, size2 = input_image["image"].convert("RGB").size
    if task != "image-outpainting":
        if size1 < size2:
            input_image["image"] = input_image["image"].convert("RGB").resize((640, int(size2 / size1 * 640)))
        else:
            input_image["image"] = input_image["image"].convert("RGB").resize((int(size1 / size2 * 640), 640))
    else:
        if size1 < size2:
            input_image["image"] = input_image["image"].convert("RGB").resize((512, int(size2 / size1 * 512)))
        else:
            input_image["image"] = input_image["image"].convert("RGB").resize((int(size1 / size2 * 512), 512))

    # Task-specific prompt suffixes used by PowerPaint.
    if task == "image-outpainting" or task == "context-aware":
        prompt = prompt + " empty scene"
    if task == "object-removal":
        prompt = prompt + " empty scene blur"

    # Outpainting path: pad the image with gray and build a mask covering
    # the padded border, overlapping blurry_gap pixels into the image.
    if (
        left_expansion_ratio is not None
        and right_expansion_ratio is not None
        and top_expansion_ratio is not None
        and bottom_expansion_ratio is not None
    ):
        o_W, o_H = input_image["image"].convert("RGB").size
        c_W = int((1 + left_expansion_ratio + right_expansion_ratio) * o_W)
        c_H = int((1 + top_expansion_ratio + bottom_expansion_ratio) * o_H)

        expand_img = np.ones((c_H, c_W, 3), dtype=np.uint8) * 127
        original_img = np.array(input_image["image"])
        expand_img[
            int(top_expansion_ratio * o_H):int(top_expansion_ratio * o_H) + o_H,
            int(left_expansion_ratio * o_W):int(left_expansion_ratio * o_W) + o_W,
            :,
        ] = original_img

        blurry_gap = 10
        expand_mask = np.ones((c_H, c_W, 3), dtype=np.uint8) * 255
        expand_mask[
            int(top_expansion_ratio * o_H) + blurry_gap:int(top_expansion_ratio * o_H) + o_H - blurry_gap,
            int(left_expansion_ratio * o_W) + blurry_gap:int(left_expansion_ratio * o_W) + o_W - blurry_gap,
            :,
        ] = 0

        input_image["image"] = Image.fromarray(expand_img)
        input_image["mask"] = Image.fromarray(expand_mask)

    promptA, promptB, negative_promptA, negative_promptB = add_task(task)

    # Round both sides down to multiples of 8 (latent-space requirement).
    # Note: W is derived from shape[0] (height) and H from shape[1] (width),
    # i.e. the names are swapped relative to convention; PIL's resize takes
    # (width, height), hence resize((H, W)).
    img = np.array(input_image["image"].convert("RGB"))
    W = int(np.shape(img)[0] - np.shape(img)[0] % 8)
    H = int(np.shape(img)[1] - np.shape(img)[1] % 8)
    input_image["image"] = input_image["image"].resize((H, W))
    input_image["mask"] = input_image["mask"].resize((H, W))

    # Zero out the masked region of the input image (mask: 255 = repaint).
    np_inpimg = np.array(input_image["image"])
    np_inmask = np.array(input_image["mask"]) / 255.0
    if len(np_inmask.shape) == 2:
        np_inmask = np.expand_dims(np_inmask, axis=-1)
    np_inpimg = np_inpimg * (1 - np_inmask)
    input_image["image"] = Image.fromarray(np_inpimg.astype(np.uint8)).convert("RGB")

    set_seed(seed)
    result = pipe(
        promptA=promptA,
        promptB=promptB,
        promptU=prompt,
        tradoff=fitting_degree,
        tradoff_nag=fitting_degree,
        image=input_image["image"].convert("RGB"),
        mask=input_image["mask"].convert("RGB"),
        num_inference_steps=ddim_steps,
        generator=torch.Generator("cuda").manual_seed(seed),
        brushnet_conditioning_scale=1.0,
        negative_promptA=negative_promptA,
        negative_promptB=negative_promptB,
        negative_promptU=negative_prompt,
        guidance_scale=scale,
        width=H,
        height=W,
    ).images[0]

    # Visualization: overlay the mask in red on the result (dividing the
    # 0-255 mask by 512 gives a 50% blend).
    mask_np = np.array(input_image["mask"].convert("RGB"))
    red = np.array(result).astype("float") * 1
    red[:, :, 0] = 180.0
    red[:, :, 2] = 0
    red[:, :, 1] = 0
    result_m = np.array(result)
    result_m = Image.fromarray(
        (
            result_m.astype("float") * (1 - mask_np.astype("float") / 512.0)
            + mask_np.astype("float") / 512.0 * red
        ).astype("uint8")
    )

    # Paste the generated content back onto the input with a Gaussian-blurred
    # mask for a soft seam. (result_paste is computed but not returned.)
    m_img = input_image["mask"].convert("RGB").filter(ImageFilter.GaussianBlur(radius=3))
    m_img = np.asarray(m_img) / 255.0
    img_np = np.asarray(input_image["image"].convert("RGB")) / 255.0
    ours_np = np.asarray(result) / 255.0
    ours_np = ours_np * m_img + (1 - m_img) * img_np
    result_paste = Image.fromarray(np.uint8(ours_np * 255))

    dict_res = [input_image["mask"].convert("RGB"), result_m]
    dict_out = [result]
    return dict_out, dict_res
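
# A minimal sketch of calling predict() directly (hypothetical input path).
# The mask convention follows the zeroing step above: white (255) marks the
# region to repaint, black (0) is kept:
#   m = np.zeros((512, 512, 3), dtype=np.uint8)
#   m[100:300, 150:400, :] = 255  # repaint this rectangle
#   out, vis = predict(
#       {"image": Image.open("./examples/input.png"), "mask": Image.fromarray(m)},
#       "a red sports car", 0.5, 30, 7.5, 143, "low quality", "text-guided",
#       None, None, None, None,
#   )
#   out[0].save("inpainted.png")
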
def custom_infer(
    input_image_path,
    input_mask_path=None,
    prompt="",
    fitting_degree=0.5,
    ddim_steps=20,
    scale=5,
    seed=143,
    negative_prompt="",
    task="text-guided",
    left_expansion_ratio=0.2,
    right_expansion_ratio=0.2,
    top_expansion_ratio=0.2,
    bottom_expansion_ratio=0.2,
):
    image = Image.open(input_image_path)
    if task == "text-guided":
        if not input_mask_path:
            raise ValueError("text-guided inpainting requires a mask image")
        mask = Image.open(input_mask_path)
        input_dict = {"image": image, "mask": mask}
        a, b = predict(
            input_dict, prompt, fitting_degree, ddim_steps, scale, seed,
            negative_prompt, task, None, None, None, None,
        )
    elif task == "image-outpainting":
        # The mask is built inside predict() from the expansion ratios.
        input_dict = {"image": image}
        a, b = predict(
            input_dict, prompt, fitting_degree, ddim_steps, scale, seed,
            negative_prompt, task,
            left_expansion_ratio, right_expansion_ratio,
            top_expansion_ratio, bottom_expansion_ratio,
        )
    else:
        raise ValueError(f"unsupported task: {task}")
    # predict() returns ([result], [mask, mask_overlay]); unpack for the UI.
    return a[0], b


# Define the Gradio interface.
inputs = [
    gr.Image(label="Input Image", type="filepath"),
    gr.Image(label="Input Mask (optional)", type="filepath"),
    gr.Textbox(label="Prompt", value="A beautiful landscape"),
    # The pipeline's tradoff expects a fitting degree in [0, 1].
    gr.Slider(label="Fitting Degree", minimum=0, maximum=1, value=0.5, step=0.05),
    gr.Slider(label="DDIM Steps", minimum=10, maximum=100, value=20, step=1),
    gr.Slider(label="Guidance Scale", minimum=1, maximum=20, value=7.5, step=0.1),
    gr.Slider(label="Seed", minimum=0, maximum=1300000, value=143, step=1),
    gr.Textbox(label="Negative Prompt", value="blur, low quality"),
    gr.Radio(label="Task", choices=["text-guided", "image-outpainting"], value="image-outpainting"),
    gr.Slider(label="Left Expansion Ratio", minimum=0, maximum=2, value=0.2, step=0.01),
    gr.Slider(label="Right Expansion Ratio", minimum=0, maximum=2, value=0.2, step=0.01),
    gr.Slider(label="Top Expansion Ratio", minimum=0, maximum=2, value=0.2, step=0.01),
    gr.Slider(label="Bottom Expansion Ratio", minimum=0, maximum=2, value=0.2, step=0.01),
]

# custom_infer returns a single image plus a list of visualizations, so the
# second output is a Gallery rather than a Textbox.
outputs = [
    gr.Image(label="Output Image"),
    gr.Gallery(label="Mask and Overlay"),
]

# Create and launch the Gradio interface.
demo = gr.Interface(fn=custom_infer, inputs=inputs, outputs=outputs, title="Inference")
demo.queue(concurrency_count=1, max_size=1, api_open=True)
demo.launch(show_api=True)
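
# A minimal headless usage sketch (hypothetical paths). Note demo.launch()
# above blocks, so comment it out to run this directly:
#   result, gallery = custom_infer(
#       "./examples/photo.png",
#       task="image-outpainting",
#       prompt="a wide mountain valley",
#       left_expansion_ratio=0.3,
#       right_expansion_ratio=0.3,
#   )
#   result.save("outpainted.png")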