muneebable committed (verified)
Commit 1b928b3 · 1 Parent(s): af77af7

Update app.py

Files changed (1)
  1. app.py +50 -49
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import numpy as np
-import random
 import torch
 from diffusers import DDPMPipeline, DDIMScheduler
 import open_clip
@@ -17,16 +16,14 @@ clip_model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pr
 clip_model.to(device)

 # Transform to preprocess images
-tfms = torchvision.transforms.Compose(
-    [
-        torchvision.transforms.Resize((224, 224)),
-        torchvision.transforms.ToTensor(),
-        torchvision.transforms.Normalize(
-            mean=(0.48145466, 0.4578275, 0.40821073),
-            std=(0.26862954, 0.26130258, 0.27577711),
-        ),
-    ]
-)
+tfms = torchvision.transforms.Compose([
+    torchvision.transforms.Resize((224, 224)),
+    torchvision.transforms.ToTensor(),
+    torchvision.transforms.Normalize(
+        mean=(0.48145466, 0.4578275, 0.40821073),
+        std=(0.26862954, 0.26130258, 0.27577711),
+    ),
+])

 # CLIP Loss function
 def clip_loss(image, text_features):
@@ -37,66 +34,70 @@ def clip_loss(image, text_features):
     return loss

 # Load Diffusion model
-model_repo_id = "muneebable/ddpm-celebahq-finetuned-anime-art" # Replace with desired model repo
+model_repo_id = "muneebable/ddpm-celebahq-finetuned-anime-art"
 image_pipe = DDPMPipeline.from_pretrained(model_repo_id)
 image_pipe.to(device)

 # Load scheduler
 scheduler = DDIMScheduler.from_pretrained(model_repo_id)
-scheduler.set_timesteps(num_inference_steps=40)

-# Gradio Inference Function
-def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)):
-    if randomize_seed:
-        seed = random.randint(0, np.iinfo(np.int32).max)
-    generator = torch.manual_seed(seed)
+def generate_image(prompt, guidance_scale, num_steps):
+    scheduler.set_timesteps(num_inference_steps=num_steps)

-    # Embed prompt with CLIP
+    # We embed a prompt with CLIP as our target
     text = open_clip.tokenize([prompt]).to(device)
-    with torch.no_grad():
+    with torch.no_grad(), torch.cuda.amp.autocast():
         text_features = clip_model.encode_text(text)
-
-    x = torch.randn(4, 3, 256, 256).to(device)
+
+    x = torch.randn(1, 3, 256, 256).to(device)
+    n_cuts = 4

     for i, t in tqdm(enumerate(scheduler.timesteps)):
         model_input = scheduler.scale_model_input(x, t)
+        # predict the noise residual
         with torch.no_grad():
             noise_pred = image_pipe.unet(model_input, t)["sample"]
         cond_grad = 0
-        for cut in range(4):
+        for cut in range(n_cuts):
+            # Set requires grad on x
             x = x.detach().requires_grad_()
+            # Get the predicted x0:
             x0 = scheduler.step(noise_pred, t, x).pred_original_sample
+            # Calculate loss
             loss = clip_loss(x0, text_features) * guidance_scale
-            cond_grad -= torch.autograd.grad(loss, x)[0] / 4
+            # Get gradient (scale by n_cuts since we want the average)
+            cond_grad -= torch.autograd.grad(loss, x)[0] / n_cuts
+
+        # Modify x based on this gradient
         alpha_bar = scheduler.alphas_cumprod[i]
         x = x.detach() + cond_grad * alpha_bar.sqrt()
+        # Now step with scheduler
         x = scheduler.step(noise_pred, t, x).prev_sample
-
-    # Convert output to an image
-    grid = torchvision.utils.make_grid(x.detach(), nrow=4)
-    im = grid.permute(1, 2, 0).cpu().clip(-1, 1) * 0.5 + 0.5
-    result_image = Image.fromarray((im.numpy() * 255).astype(np.uint8))

-    return result_image, seed
-
-# Gradio App
-with gr.Blocks() as demo:
-    prompt = gr.Textbox(placeholder="Enter your prompt", label="Prompt")
-    run_button = gr.Button("Generate")
-
-    result = gr.Image(label="Generated Image")
+    grid = torchvision.utils.make_grid(x.detach(), nrow=1)
+    im = grid.permute(1, 2, 0).cpu().clip(-1, 1) * 0.5 + 0.5
+    im = (im * 255).byte().numpy()
+    return Image.fromarray(im)

-    with gr.Accordion("Advanced Settings"):
-        negative_prompt = gr.Textbox(label="Negative Prompt")
-        seed = gr.Slider(0, np.iinfo(np.int32).max, value=0, label="Seed")
-        randomize_seed = gr.Checkbox(True, label="Randomize Seed")
-        width = gr.Slider(256, 1024, value=512, label="Width")
-        height = gr.Slider(256, 1024, value=512, label="Height")
-        guidance_scale = gr.Slider(0.0, 10.0, value=7.5, label="Guidance Scale")
-        num_inference_steps = gr.Slider(1, 50, value=50, label="Steps")
+# Gradio interface
+def gradio_interface(prompt, guidance_scale, num_steps):
+    return generate_image(prompt, guidance_scale, num_steps)

-    run_button.click(infer,
-        inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
-        outputs=[result, seed])
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        gr.Textbox(label="Prompt", value="Red Rose (still life), red flower painting"),
+        gr.Slider(minimum=1, maximum=20, step=1, label="Guidance Scale", value=8),
+        gr.Slider(minimum=10, maximum=100, step=10, label="Number of Steps", value=50)
+    ],
+    outputs=gr.Image(type="pil", label="Generated Image"),
+    title="CLIP-Guided Diffusion Image Generation",
+    description="Generate images using CLIP-guided diffusion. Enter a prompt, adjust the guidance scale, and set the number of steps.",
+    examples=[
+        ["A serene landscape with mountains and a lake", 10, 50],
+        ["A futuristic cityscape at night", 15, 70],
+        ["A cute cartoon character", 5, 30]
+    ]
+)

-demo.queue().launch()
+iface.launch()
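The hunks above call a clip_loss(image, text_features) helper whose body is unchanged by this commit and therefore hidden from the diff. For context, here is a minimal sketch of what a CLIP guidance loss with this signature could look like, assuming the module-level clip_model from app.py; the name clip_loss_sketch and the cosine-distance formulation are illustrative, and the actual body in the file may differ.

# Hedged sketch only: the real clip_loss body is not shown in this diff.
# Assumes the module-level clip_model defined in app.py.
import torch.nn.functional as F
import torchvision

def clip_loss_sketch(image, text_features):
    # Diffusion samples live in [-1, 1]; map to [0, 1] and prepare for CLIP.
    image = (image * 0.5 + 0.5).clamp(0, 1)
    image = torchvision.transforms.functional.resize(image, (224, 224))
    image = torchvision.transforms.functional.normalize(
        image,
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    )
    image_features = clip_model.encode_image(image)
    # Cosine distance between image and prompt embeddings, averaged over the batch.
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)
    return (1.0 - (image_features * text_features).sum(dim=-1)).mean()

Keeping the loss differentiable end to end is what lets the loop above take torch.autograd.grad(loss, x) and nudge x toward the prompt.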
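A quick, equally hedged usage sketch: the new generate_image entry point can be exercised without the Gradio UI as a smoke test, assuming app.py's module-level setup (pipeline, scheduler, CLIP model) has already run; the output path is arbitrary.

# Hypothetical smoke test for generate_image; not part of the commit.
if __name__ == "__main__":
    img = generate_image(
        "Red Rose (still life), red flower painting",  # default prompt from the UI
        guidance_scale=8,   # default slider value
        num_steps=50,       # default slider value
    )
    img.save("clip_guided_sample.png")  # arbitrary output file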