import gradio as gr
from PIL import Image
from ultralytics import YOLO
import torchvision.transforms.functional as TVF
from transformers import Owlv2VisionModel
from torch import nn
import torch
import torch.nn.functional as F


# OWLv2 classification head
class DetectorModelOwl(nn.Module):
    owl: Owlv2VisionModel

    def __init__(self, model_path: str, dropout: float, n_hidden: int = 768):
        super().__init__()

        # Frozen OWLv2 vision backbone
        owl = Owlv2VisionModel.from_pretrained(model_path)
        assert isinstance(owl, Owlv2VisionModel)
        self.owl = owl
        self.owl.requires_grad_(False)
        self.transforms = None

        # Two-layer MLP classifier head (watermarked vs. not watermarked)
        self.dropout1 = nn.Dropout(dropout)
        self.ln1 = nn.LayerNorm(n_hidden, eps=1e-5)
        self.linear1 = nn.Linear(n_hidden, n_hidden * 2)
        self.act1 = nn.GELU()
        self.dropout2 = nn.Dropout(dropout)
        self.ln2 = nn.LayerNorm(n_hidden * 2, eps=1e-5)
        self.linear2 = nn.Linear(n_hidden * 2, 2)

    def forward(self, pixel_values: torch.Tensor, labels: torch.Tensor | None = None):
        with torch.autocast("cpu", dtype=torch.bfloat16):
            # Embed the image
            outputs = self.owl(pixel_values=pixel_values, output_hidden_states=True)
            x = outputs.last_hidden_state  # B, N, C

            # Hidden layer
            x = self.dropout1(x)
            x = self.ln1(x)
            x = self.linear1(x)
            x = self.act1(x)

            # Pool over tokens (max pooling worked better than mean) and normalize
            x = self.dropout2(x)
            # x = x.mean(dim=1)
            x, _ = x.max(dim=1)
            x = self.ln2(x)

            # Classification layer
            x = self.linear2(x)

            if labels is not None:
                loss = F.cross_entropy(x, labels)
                return (x, loss)

            return (x,)


def owl_predict(image: Image.Image) -> bool:
    # Process the image
    # Pad to square with a neutral grey background
    big_side = max(image.size)
    new_image = Image.new("RGB", (big_side, big_side), (128, 128, 128))
    new_image.paste(image, (0, 0))

    # Resize to 960x960 (bicubic performed best in my tests, even compared to Lanczos)
    preped = new_image.resize((960, 960), Image.BICUBIC)
    # preped = new_image.resize((1008, 1008), Image.BICUBIC)

    # Convert to tensor and normalize with the CLIP mean/std
    preped = TVF.pil_to_tensor(preped)
    preped = preped / 255.0
    input_image = TVF.normalize(preped, [0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])

    # Run the classifier
    logits, = model(input_image.to('cpu').unsqueeze(0), None)
    probs = F.softmax(logits, dim=1)
    prediction = torch.argmax(probs.cpu(), dim=1)

    return prediction.item() == 1


def yolo_predict(image: Image.Image) -> Image.Image:
    results = yolo_model(image, imgsz=1024, augment=True, iou=0.5)
    assert len(results) == 1
    result = results[0]

    # Render the detections; result.plot() returns a BGR array, so flip channels to RGB
    im_array = result.plot()
    im = Image.fromarray(im_array[..., ::-1])

    return im


def predict(image: Image.Image, conf_threshold: float):
    # OWLv2 classification
    owl_prediction = owl_predict(image)
    label_owl = "Watermarked" if owl_prediction else "Not Watermarked"

    # YOLO detection
    yolo_image = yolo_predict(image)

    return yolo_image, f"OWLv2 Prediction: {label_owl}"


# Load OWLv2 classification model
model = DetectorModelOwl("google/owlv2-base-patch16-ensemble", dropout=0.0)
model.load_state_dict(torch.load("far5y1y5-8000.pt", map_location="cpu"))
model.eval()

# Load YOLO model
yolo_model = YOLO("yolo11x-train28-best.pt")

gradio_app = gr.Blocks()

with gr.Blocks() as app:
    gr.HTML(
        """