|
import gradio as gr |
|
from PIL import Image |
|
from ultralytics import YOLO |
|
import torchvision.transforms.functional as TVF |
|
from transformers import Owlv2VisionModel |
|
from torch import nn |
|
import torch |
|
import torch.nn.functional as F |
|
|
|
|
|
|
|
class DetectorModelOwl(nn.Module):
    """Two-class image classifier: a frozen OWLv2 vision encoder plus an MLP head.

    The encoder's per-token features go through dropout -> LayerNorm ->
    Linear -> GELU -> dropout, are max-pooled over dimension 1, and are then
    normalized and projected to two logits.
    """

    owl: Owlv2VisionModel

    def __init__(self, model_path: str, dropout: float, n_hidden: int = 768):
        """Load the encoder and build the classification head.

        Args:
            model_path: Identifier or path handed to
                ``Owlv2VisionModel.from_pretrained``.
            dropout: Dropout probability used by both dropout layers.
            n_hidden: Width of the features fed to the head. Presumably this
                has to equal the encoder's hidden size -- confirm when
                switching to a different checkpoint.
        """
        super().__init__()

        owl = Owlv2VisionModel.from_pretrained(model_path)
        assert isinstance(owl, Owlv2VisionModel)
        self.owl = owl
        # The encoder is frozen; only the head below keeps trainable parameters.
        self.owl.requires_grad_(False)
        self.transforms = None

        self.dropout1 = nn.Dropout(dropout)
        self.ln1 = nn.LayerNorm(n_hidden, eps=1e-5)
        self.linear1 = nn.Linear(n_hidden, n_hidden * 2)
        self.act1 = nn.GELU()
        self.dropout2 = nn.Dropout(dropout)
        self.ln2 = nn.LayerNorm(n_hidden * 2, eps=1e-5)
        self.linear2 = nn.Linear(n_hidden * 2, 2)

    def forward(
        self,
        pixel_values: torch.Tensor,
        labels: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
        """Classify a batch of preprocessed images.

        Args:
            pixel_values: Image batch passed straight to the OWLv2 encoder.
            labels: Optional class targets. When given, the cross-entropy
                loss against the logits is returned as well.

        Returns:
            ``(logits,)`` when ``labels`` is ``None``, otherwise
            ``(logits, loss)``. ``logits`` has two entries per image.
        """
        # NOTE(review): the autocast device type is hard-coded to "cpu", so a
        # model moved to another device would not get mixed precision here.
        with torch.autocast("cpu", dtype=torch.bfloat16):
            # Only the final layer's features are consumed, so the encoder is
            # not asked to also return every intermediate hidden state.
            outputs = self.owl(pixel_values=pixel_values)
            x = outputs.last_hidden_state

            x = self.dropout1(x)
            x = self.ln1(x)
            x = self.linear1(x)
            x = self.act1(x)

            x = self.dropout2(x)

            # Max-pool over dimension 1 (assumed to be the token axis of a
            # (batch, tokens, features) tensor -- TODO confirm).
            x, _ = x.max(dim=1)
            x = self.ln2(x)

            x = self.linear2(x)

        if labels is not None:
            loss = F.cross_entropy(x, labels)
            return (x, loss)

        return (x,)
|
|
|
|
|
def owl_predict(image: Image.Image) -> bool:
    """Run the OWLv2-based classifier on one image.

    The image is pasted into the top-left corner of a gray square whose side
    equals the image's longer edge, resized to 960x960 with bicubic
    interpolation, scaled to [0, 1] and normalized per channel before being
    fed to the module-level ``model``.

    Args:
        image: Input picture as a PIL image.

    Returns:
        ``True`` when the highest-scoring class is index 1 (the class that
        ``predict`` reports as "Watermarked"), ``False`` otherwise.
    """
    # Pad to a square so the resize below does not distort the aspect ratio.
    big_side = max(image.size)
    new_image = Image.new("RGB", (big_side, big_side), (128, 128, 128))
    new_image.paste(image, (0, 0))

    preped = new_image.resize((960, 960), Image.BICUBIC)

    # uint8 CHW tensor -> float in [0, 1] -> per-channel normalization.
    # NOTE(review): the mean/std values look like the CLIP image statistics;
    # confirm they match what the checkpoint was trained with.
    preped = TVF.pil_to_tensor(preped)
    preped = preped / 255.0
    input_image = TVF.normalize(
        preped,
        [0.48145466, 0.4578275, 0.40821073],
        [0.26862954, 0.26130258, 0.27577711],
    )

    # Pure inference: without no_grad, autograd would record the operations
    # of the head layers (their parameters still require grad), keeping
    # activations alive for a backward pass that never happens.
    with torch.no_grad():
        logits, = model(input_image.to('cpu').unsqueeze(0), None)
        probs = F.softmax(logits, dim=1)
        prediction = torch.argmax(probs.cpu(), dim=1)

    return prediction.item() == 1
|
|
|
|
|
def yolo_predict(image: Image.Image, conf_threshold: float | None = None) -> Image.Image:
    """Run the YOLO detector on one image and return the annotated picture.

    Args:
        image: Input picture as a PIL image.
        conf_threshold: Minimum detection confidence forwarded to the
            detector as ``conf``. When ``None`` the argument is omitted and
            the detector's own default applies.

    Returns:
        A PIL image with the detections drawn on it.
    """
    # Only override the detector's confidence when the caller supplied one.
    overrides = {} if conf_threshold is None else {"conf": conf_threshold}
    results = yolo_model(image, imgsz=1024, augment=True, iou=0.5, **overrides)
    assert len(results) == 1
    result = results[0]

    # The channel axis of the plotted array is reversed before handing it to
    # PIL (presumably BGR -> RGB -- confirm against the plotting API).
    im_array = result.plot()
    im = Image.fromarray(im_array[..., ::-1])

    return im


def predict(image: Image.Image, conf_threshold: float):
    """Gradio callback: run both detectors on the uploaded image.

    Args:
        image: Uploaded picture as a PIL image.
        conf_threshold: Value of the UI confidence slider. It is applied to
            the YOLO detector only; the OWLv2 classifier decision is an
            argmax and takes no threshold.

    Returns:
        A pair ``(annotated_image, text)`` where ``annotated_image`` is the
        YOLO visualisation and ``text`` is the OWLv2 verdict.
    """
    owl_prediction = owl_predict(image)
    label_owl = "Watermarked" if owl_prediction else "Not Watermarked"

    # Forward the slider value so the UI control actually affects detections.
    yolo_image = yolo_predict(image, conf_threshold)

    return yolo_image, f"OWLv2 Prediction: {label_owl}"
|
|
|
|
|
|
|
# OWLv2-based classifier: build the architecture, then load the fine-tuned
# weights on CPU and switch to evaluation mode (disables dropout).
model = DetectorModelOwl("google/owlv2-base-patch16-ensemble", dropout=0.0)
# The file is consumed as a plain state dict, so restrict unpickling to
# tensors and basic containers instead of allowing arbitrary pickled objects.
model.load_state_dict(
    torch.load("far5y1y5-8000.pt", map_location="cpu", weights_only=True)
)
model.eval()

# YOLO detector used to draw detections on the input image.
yolo_model = YOLO("yolo11x-train28-best.pt")
|
|
|
|
|
# Gradio interface: image + slider + button on the left, detector outputs on
# the right.
with gr.Blocks() as app:
    gr.HTML(
        """
        <h1>Watermark Detection</h1>
        """
    )

    with gr.Row():
        with gr.Column():
            image = gr.Image(type="pil", label="Image")
            conf_threshold = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, label="Confidence Threshold")
            btn_submit = gr.Button(value="Detect Watermarks")

        with gr.Column():
            image_yolo = gr.Image(type="pil", label="YOLO Detections")
            label_owl = gr.Label(label="OWLv2 Prediction: N/A")

    # Event wiring must happen inside the Blocks context.
    btn_submit.click(fn=predict, inputs=[image, conf_threshold], outputs=[image_yolo, label_owl])

# Keep the `gradio_app` name available, but point it at the interface that
# was actually built rather than at a separate, empty Blocks instance.
gradio_app = app

if __name__ == "__main__":
    app.launch()