# Spaces: Running  (page-status residue captured during extraction; not part of the code)
from typing import Tuple, List | |
import cv2 | |
import numpy as np | |
import supervision as sv | |
import torch | |
from PIL import Image | |
from torchvision.ops import box_convert | |
import groundingdino.datasets.transforms as T | |
from groundingdino.models import build_model | |
from groundingdino.util.misc import clean_state_dict | |
from groundingdino.util.slconfig import SLConfig | |
from groundingdino.util.utils import get_phrases_from_posmap | |
def preprocess_caption(caption: str) -> str:
    """Lower-case and trim a caption, ensuring it ends with a period.

    Args:
        caption: Raw text prompt.

    Returns:
        The caption lower-cased and stripped of surrounding whitespace, with a
        trailing ``"."`` appended when it does not already end with one. An
        empty (or whitespace-only) caption therefore becomes ``"."``.
    """
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = 'cuda'):
    """Build a model from a config file and load checkpoint weights into it.

    Args:
        model_config_path: Path to the config file read by ``SLConfig.fromfile``.
        model_checkpoint_path: Path to a checkpoint readable by ``torch.load``
            whose top-level ``"model"`` entry holds the state dict.
        device: Value stored on the config as ``args.device`` before the model
            is built. The checkpoint tensors themselves are mapped to CPU here;
            this function does not call ``model.to(device)``.

    Returns:
        The constructed model, switched to evaluation mode via ``model.eval()``.
    """
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)

    # SECURITY NOTE: torch.load is pickle-based and, depending on the torch
    # version / ``weights_only`` setting, can execute arbitrary code embedded
    # in the file. Only load checkpoints from a trusted source.
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")

    # strict=False: missing or unexpected keys are tolerated instead of raising.
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    model.eval()
    return model
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    """Load an image from disk and prepare it for the model.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        A pair ``(image, image_transformed)`` where ``image`` is the RGB image
        as a NumPy array (untransformed) and ``image_transformed`` is the
        result of applying the resize / to-tensor / normalize pipeline below.
    """
    transform = T.Compose(
        [
            # Single candidate size of 800 with a cap of 1333
            # (presumably shorter-side / longer-side limits; confirm against
            # groundingdino.datasets.transforms.RandomResize).
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            # Per-channel mean and std (these look like the usual ImageNet
            # statistics; confirm if the checkpoint expects something else).
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    # Open under a context manager so the underlying file handle is released
    # promptly; convert() returns an independent in-memory image.
    with Image.open(image_path) as raw_image:
        image_source = raw_image.convert("RGB")

    image = np.asarray(image_source)
    # The transforms take (image, target); no target is supplied here.
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed
def predict(
        model,
        image: torch.Tensor,
        caption: str,
        box_threshold: float,
        text_threshold: float,
        device: str = 'cuda',
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    """Run the model on one image and caption and return thresholded detections.

    Args:
        model: Model callable as ``model(images, captions=[...])`` returning a
            mapping with ``"pred_logits"`` and ``"pred_boxes"``, and exposing a
            ``tokenizer`` attribute.
        image: A single image tensor; a batch dimension is added with
            ``image[None]`` before the forward pass.
        caption: Text prompt; normalized with ``preprocess_caption`` first.
        box_threshold: A query is kept when its highest per-token score
            (after sigmoid) is strictly greater than this value.
        text_threshold: Per-token score cut-off used to build the boolean
            position map handed to ``get_phrases_from_posmap``.
        device: Device the model and image are moved to for inference.

    Returns:
        ``(boxes, scores, phrases)``: the kept boxes, the highest per-token
        score of each kept query, and one phrase string per kept query with
        every ``"."`` removed. All tensors are on the CPU.
    """
    caption = preprocess_caption(caption=caption)

    model = model.to(device)
    image = image.to(device)

    with torch.no_grad():
        outputs = model(image[None], captions=[caption])

    # Take the first (only) batch element and work on the CPU from here on.
    prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0]  # prediction_logits.shape = (nq, 256)
    prediction_boxes = outputs["pred_boxes"].cpu()[0]  # prediction_boxes.shape = (nq, 4)

    # Highest token score per query; computed once and reused for both the
    # keep-mask and the returned confidences.
    query_scores = prediction_logits.max(dim=1)[0]
    mask = query_scores > box_threshold
    logits = prediction_logits[mask]  # logits.shape = (n, 256)
    boxes = prediction_boxes[mask]  # boxes.shape = (n, 4)

    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)

    phrases = [
        get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
        for logit in logits
    ]

    return boxes, query_scores[mask], phrases
def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
    """Draw labelled boxes on an image.

    Args:
        image_source: Image array of shape ``(height, width, channels)`` in RGB
            order (it is converted with ``cv2.COLOR_RGB2BGR`` before drawing).
        boxes: Boxes in ``cxcywh`` format, scaled here by
            ``[width, height, width, height]`` to get pixel coordinates
            (so they are expected to be normalized to the image size).
        logits: One score per box, formatted to two decimals in the label.
        phrases: One text label per box. Labels are built with ``zip``, so
            extra entries in the longer of ``phrases`` / ``logits`` are ignored.

    Returns:
        The frame returned by ``sv.BoxAnnotator.annotate``, drawn on the
        BGR-converted copy of ``image_source``.
    """
    h, w, _ = image_source.shape

    # Detach and move to the CPU so boxes that still live on an accelerator or
    # track gradients can be scaled by the CPU tensor below and converted with
    # .numpy(); this is a no-op for plain CPU tensors.
    boxes = boxes.detach().cpu() * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    detections = sv.Detections(xyxy=xyxy)

    labels = [
        f"{phrase} {logit:.2f}"
        for phrase, logit in zip(phrases, logits)
    ]

    box_annotator = sv.BoxAnnotator()
    annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
    return annotated_frame