File size: 2,748 Bytes
39ccbfb
e3b8308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6f56d2
e3b8308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6f56d2
e3b8308
 
 
8dd4d2a
 
 
 
39ccbfb
8dd4d2a
54eb3a0
 
 
 
f4d7772
39ccbfb
e3b8308
 
 
 
 
 
 
 
d3f9419
 
54eb3a0
 
d982af7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
import PIL
import gradio as gr
from PIL import Image, ImageDraw
import requests

# Load the DETR image processor and detection model once, at import time.
# The "no_timm" revision is requested so the timm dependency is not needed.
_DETR_CHECKPOINT = "facebook/detr-resnet-101"
_DETR_REVISION = "no_timm"
processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT, revision=_DETR_REVISION)
model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT, revision=_DETR_REVISION)

def biggest_obj(res):
    """Return the detection whose bounding box covers the largest area.

    Args:
        res: Mapping with a ``"boxes"`` entry (iterable of boxes, each
            exposing ``tolist()`` and yielding four coordinates unpacked as
            ``x1, y1, x2, y2``) and a ``"labels"`` entry (indexable; each
            element exposes ``item()``).

    Returns:
        Tuple ``(ind, coords, cl)``: the index of the chosen box, its four
        coordinates truncated to ``int``, and the class name looked up in
        ``model.config.id2label``.

    Raises:
        ValueError: If ``res["boxes"]`` contains no boxes.
    """
    # Convert every box exactly once; coordinates are truncated to ints.
    boxes = [[int(v) for v in bb.tolist()] for bb in res["boxes"]]
    if not boxes:
        raise ValueError("no detections: cannot select the biggest object")

    def _area(indexed_box):
        x1, y1, x2, y2 = indexed_box[1]
        return abs(x2 - x1) * abs(y1 - y2)

    # max() keeps the first box on ties and still returns a result when
    # every box has zero area, so ind/coords are always defined.
    ind, coords = max(enumerate(boxes), key=_area)
    cl = model.config.id2label[res["labels"][ind].item()]
    return ind, coords, cl


def create_mask(im_shape:tuple, mask_zone:list):
    """Build a single-channel mask that is 255 inside *mask_zone*, 0 elsewhere.

    Args:
        im_shape: Size passed to ``Image.new`` for the mask image.
        mask_zone: Rectangle coordinates passed to ``ImageDraw.rectangle``.

    Returns:
        The "L"-mode PIL image with the rectangle filled in.
    """
    canvas = Image.new(mode="L", size=im_shape, color=0)
    ImageDraw.Draw(canvas).rectangle(mask_zone, fill=255)
    return canvas
    
from diffusers import StableDiffusionInpaintPipeline

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on CUDA; on the CPU fallback the weights
# are loaded as float32 because float16 inference is poorly supported there.
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    revision="fp16",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

def predict(image, prompt):
    """Inpaint the largest detected object in *image* according to *prompt*.

    The image is converted to RGB and resized to 512x512, DETR detections
    scoring above 0.9 are kept, the largest box is turned into a mask, the
    inpainting pipeline regenerates the masked area, and the chosen box is
    outlined in red on the resized input.

    Args:
        image: PIL image (the interface declares ``type='pil'``).
        prompt: Text prompt forwarded to the inpainting pipeline.

    Returns:
        Tuple ``(inpainted, annotated)``: the first image produced by the
        pipeline and the resized input with the selected box outlined.

    Raises:
        gr.Error: If no detection passes the score threshold.
    """
    image = image.convert("RGB").resize((512, 512))

    inputs = processor(images=image, return_tensors="pt")
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); reverse it for post-processing.
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=0.9
    )[0]

    # Without any confident detection there is nothing to mask; surface a
    # readable message in the UI instead of an opaque traceback.
    if len(results["boxes"]) == 0:
        raise gr.Error("No object detected in the image; try another picture.")

    # Only the coordinates of the biggest bounding box are needed here.
    _, coords, _ = biggest_obj(results)
    mask_image = create_mask(image.size, coords)

    images = pipe(
        prompt=prompt,
        image=image,
        mask_image=mask_image,
        guidance_scale=5,
        # Fixed seed for reproducible output; the generator is created on the
        # same device the pipeline was moved to, so CPU-only hosts work too.
        generator=torch.Generator(device=device).manual_seed(0),
        num_images_per_prompt=1,
    ).images

    # Outline the selected box on the (resized) input for the second output.
    ImageDraw.Draw(image).rectangle(coords, outline="red", width=2)

    return images[0], image
    
# Example (image file, prompt) pairs offered in the UI.
examples = [
    ["cats.png", "cat is smiling"],
    ["dog.jpg", "dog with big eyes"],
    ["dog1.jpg", "dog with big bone"],
    ["beaver.jpg", "big strong beaver"],
]

# Wire predict() to an image + prompt input and two image outputs, then
# start the app.
demo = gr.Interface(
    fn=predict,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="prompt")],
    outputs=[gr.Image(), gr.Image()],
    examples=examples,
    title="Stable Diffusion In-Painting",
)
demo.launch(debug=True, share=True)