copied and modified a script that uses owl2 from a space here, but while the demo detects multiple cats in one image my local copy of the script always only detects one. If I change the threshold, it only decides for the other cat in the demo picture, but it will never detect both cats.

Here is the slightly modified script and I would kindly ask if you can give me advice, why this script always just finds one of the cats (in other pictures it finds one of two car license plates and so on).

from transformers import pipeline, SamModel, SamProcessor
import torch
import numpy as np
from PIL import Image, ImageDraw
import os

checkpoint = "google/owlv2-base-patch16-ensemble"
detector = pipeline(model=checkpoint, task="zero-shot-object-detection", device="cpu")
sam_model = SamModel.from_pretrained("facebook/sam-vit-base").to("cpu")
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

def query(image_path, texts, threshold, sam_threshold):
"""
This function analyzes an image for objects specified in the text input.

Args:
image_path: Path to the image file.
texts: Comma-separated list of object labels to search for in the image.
threshold: Confidence threshold for the object detection pipeline.
sam_threshold: Threshold for the second-stage mask-based verification.

Returns:
Path to the annotated image file.
"""

print("Pfad")
print(image_path)
texts = texts.split(",")
image = Image.open(image_path).convert("RGB")
#image_np = np.array(image)
#print(image_np)

Pass the image directly as an argument to the detector

predictions = detector(
image, # Provide the image here
candidate_labels=texts,
threshold=threshold
)

result_labels = []
for pred in predictions:
box = pred["box"]
score = pred["score"]
label = pred["label"]
box = [round(box["xmin"], 2), round(box["ymin"], 2),
round(box["xmax"], 2), round(box["ymax"], 2)]
inputs = sam_processor(
images=image, # Ensure the correct argument is used
input_boxes=[[box]],
return_tensors="pt"
).to("cpu")
with torch.no_grad():
outputs = sam_model(**inputs)

  mask = sam_processor.image_processor.post_process_masks(
      outputs.pred_masks.cpu(),
      inputs["original_sizes"].cpu(),
      inputs["reshaped_input_sizes"].cpu()
  )
  iou_scores = outputs["iou_scores"]

  masks, testscores, boxes = sam_processor.image_processor.filter_masks(
      mask[0],
      iou_scores[0].cpu(),
      inputs["original_sizes"][0].cpu(),
      box,
      pred_iou_thresh=sam_threshold,
  )
  #print(mask[0][0][0])

  result_labels.append((mask[0][0][0].numpy(), label))

draw = ImageDraw.Draw(image)
print(result_labels)
for mask, label in result_labels:
mask = Image.fromarray((mask * 255).astype(np.uint8), mode='L').resize(image.size)
mask = mask.convert("RGBA")
image.paste(mask, (0, 0), mask)

base_name, ext = os.path.splitext(image_path)
output_path = f"{base_name}_annotated{ext}"
image.save(output_path)

return output_path

output_path = query("tmp3.jpg", "license plates", 0.6, 0.88)
print(f"Annotated image saved as {output_path}")

google
/

owlv2-base-patch16-ensemble

Detect multiple objects

Pass the image directly as an argument to the detector