import argparse
import copy
import gc
import io
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import PIL
import torch
from PIL import Image, ImageDraw, ImageFont

# OWL-ViT detection
from transformers import OwlViTProcessor, OwlViTForObjectDetection

# Segment Anything (SAM)
from segment_anything import build_sam, SamPredictor

def show_mask(mask, ax, random_color=False):
    # Overlay a single binary mask on a matplotlib axis as a translucent color
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)

def show_box(box, ax):
    # Draw an xyxy box as a green rectangle outline
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2))

def plot_boxes_to_image(image_pil, tgt):
    # Draw predicted boxes and their text labels onto a PIL image; also
    # return a binary mask image marking the box regions.
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    # draw boxes and masks
    for box, label in zip(boxes, labels):
        # random color per box
        color = tuple(np.random.randint(0, 255, size=3).tolist())

        x0, y0, x1, y1 = (int(v) for v in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        # measure the label text; draw.textsize was removed in Pillow 10,
        # so only fall back to it on older Pillow versions
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the OWL-ViT detection model and processor
owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
owlvit_model.eval()
owlvit_processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

# Build the Segment Anything (SAM) predictor from a local ViT-H checkpoint
sam_predictor = SamPredictor(build_sam(checkpoint="./sam_vit_h_4b8939.pth").to(device))
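# NOTE: the checkpoint path above is assumed to already exist locally; if it
# does not, the official ViT-H weights can be downloaded first, e.g.:
#   wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth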

def query_image(img, text_prompt, box_threshold):
    # Accept either a numpy array (e.g. from Gradio) or a PIL image
    if not isinstance(img, PIL.Image.Image):
        pil_img = Image.fromarray(np.uint8(img)).convert("RGB")
    else:
        pil_img = img.convert("RGB")

    # Comma-separated text queries, e.g. "a cat,a remote control"
    texts = [t.strip() for t in text_prompt.split(",")]

    # Run open-vocabulary object detection with OWL-ViT
    with torch.no_grad():
        inputs = owlvit_processor(text=texts, images=pil_img, return_tensors="pt").to(device)
        outputs = owlvit_model(**inputs)

    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.Tensor([pil_img.size[::-1]])
    # Convert outputs (bounding boxes and class logits) to COCO API format
    results = owlvit_processor.post_process_object_detection(
        outputs=outputs, threshold=box_threshold, target_sizes=target_sizes.to(device)
    )

    i = 0  # retrieve predictions for the first (and only) image
    # Keep every detection above the score threshold; results[i]["labels"]
    # holds indices into the list of text queries
    boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

    boxes = boxes.cpu().detach().numpy()
    normalized_boxes = copy.deepcopy(boxes)

    # Collect predictions for visualization
    size = pil_img.size
    pred_dict = {
        "boxes": normalized_boxes,
        "size": [size[1], size[0]],  # H, W
        "labels": [texts[int(idx)] for idx in labels],
    }
    # Free detection intermediates before running SAM
    gc.collect()
    torch.cuda.empty_cache()
    # Run Segment Anything (SAM) on the detected boxes
    image = np.array(pil_img)  # already RGB, which is what SamPredictor expects
    sam_predictor.set_image(image)

    boxes = torch.from_numpy(boxes).to(sam_predictor.device)
    transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes, image.shape[:2])
    masks, _, _ = sam_predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    # Overlay masks and boxes with matplotlib, then capture the figure
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    for mask in masks:
        show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
    for box in boxes:
        show_box(box.cpu().numpy(), plt.gca())
    plt.axis("off")

    buf = io.BytesIO()
    plt.savefig(buf)
    plt.close()
    buf.seek(0)
    owlvit_segment_image = Image.open(buf).convert("RGB")

    # Grounded detection results drawn directly on the input image
    image_with_box = plot_boxes_to_image(pil_img, pred_dict)[0]

    # return owlvit_segment_image, image_with_box
    return owlvit_segment_image
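

# Minimal usage sketch (assumptions: the "./demo.jpg" path, the queries, and
# the threshold below are illustrative placeholders, not part of the app):
if __name__ == "__main__":
    demo_img = Image.open("./demo.jpg")
    seg_image = query_image(demo_img, "a cat,a remote control", box_threshold=0.1)
    seg_image.save("owlvit_sam_result.png")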