import gradio as gr
from PIL import Image
from ultralytics import YOLO
import torchvision.transforms.functional as TVF
from transformers import Owlv2VisionModel
from torch import nn
import torch
import torch.nn.functional as F
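
# This demo chains two models: an OWLv2-based binary classifier that flags whether an
# image contains a watermark, and a YOLO detector that localizes the watermarks it finds.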


# OWLv2 classification head
class DetectorModelOwl(nn.Module):
	owl: Owlv2VisionModel

	def __init__(self, model_path: str, dropout: float, n_hidden: int = 768):
		super().__init__()

		owl = Owlv2VisionModel.from_pretrained(model_path)
		assert isinstance(owl, Owlv2VisionModel)
		self.owl = owl
		self.owl.requires_grad_(False)
		self.transforms = None

		self.dropout1 = nn.Dropout(dropout)
		self.ln1 = nn.LayerNorm(n_hidden, eps=1e-5)
		self.linear1 = nn.Linear(n_hidden, n_hidden * 2)
		self.act1 = nn.GELU()
		self.dropout2 = nn.Dropout(dropout)
		self.ln2 = nn.LayerNorm(n_hidden * 2, eps=1e-5)
		self.linear2 = nn.Linear(n_hidden * 2, 2)
	
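	# Head: the frozen OWLv2 vision tower embeds the image into patch tokens, which are
	# projected, max-pooled over the token dimension, and mapped to two logits
	# (index 1 = watermarked).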
	def forward(self, pixel_values: torch.Tensor, labels: torch.Tensor | None = None):
		with torch.autocast("cpu", dtype=torch.bfloat16):
			# Embed the image
			outputs = self.owl(pixel_values=pixel_values, output_hidden_states=True)
			x = outputs.last_hidden_state  # B, N, C
		
			# Linear
			x = self.dropout1(x)
			x = self.ln1(x)
			x = self.linear1(x)
			x = self.act1(x)

			# Pool (max over tokens; mean pooling left commented out) and norm
			x = self.dropout2(x)
			#x = x.mean(dim=1)
			x, _ = x.max(dim=1)
			x = self.ln2(x)

			# Linear
			x = self.linear2(x)
		
		if labels is not None:
			loss = F.cross_entropy(x, labels)
			return (x, loss)

		return (x,)


def owl_predict(image: Image.Image) -> bool:
	# Process the image
	# Pad to square
	big_side = max(image.size)
	new_image = Image.new("RGB", (big_side, big_side), (128, 128, 128))
	new_image.paste(image, (0, 0))

	# Resize to 960x960
	preped = new_image.resize((960, 960), Image.BICUBIC)  # Bicubic performed best in my tests (even compared to Lanczos)
	#preped = new_image.resize((1008, 1008), Image.BICUBIC)  # Bicubic performed best in my tests (even compared to Lanczos)

	# Convert to tensor and normalize with the CLIP image mean/std used by OWLv2
	preped = TVF.pil_to_tensor(preped)
	preped = preped / 255.0
	input_image = TVF.normalize(preped, [0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])

	# Run the classifier (inference only, so no gradients are needed)
	with torch.no_grad():
		logits, = model(input_image.to('cpu').unsqueeze(0), None)
	probs = F.softmax(logits, dim=1)
	prediction = torch.argmax(probs.cpu(), dim=1)

	return prediction.item() == 1


def yolo_predict(image: Image.Image, conf_threshold: float) -> Image.Image:
	results = yolo_model(image, imgsz=1024, augment=True, iou=0.5, conf=conf_threshold)
	assert len(results) == 1
	result = results[0]
	im_array = result.plot()  # annotated image as a BGR numpy array
	im = Image.fromarray(im_array[..., ::-1])  # BGR -> RGB

	return im


def predict(image: Image.Image, conf_threshold: float):
	# OWLv2
	owl_prediction = owl_predict(image)
	label_owl = "Watermarked" if owl_prediction else "Not Watermarked"

	# YOLO (forward the slider value as the detection confidence threshold)
	yolo_image = yolo_predict(image, conf_threshold)

	return yolo_image, f"OWLv2 Prediction: {label_owl}"


# Load OWLv2 classification model
model = DetectorModelOwl("google/owlv2-base-patch16-ensemble", dropout=0.0)
model.load_state_dict(torch.load("far5y1y5-8000.pt", map_location="cpu"))
model.eval()

# Load YOLO model
yolo_model = YOLO("yolo11x-train28-best.pt")


with gr.Blocks() as app:
	gr.HTML(
		"""
		<h1>Watermark Detection</h1>
		"""
	)

	with gr.Row():
		with gr.Column():
			image = gr.Image(type="pil", label="Image")
			conf_threshold = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, label="Confidence Threshold")
			btn_submit = gr.Button(value="Detect Watermarks")
		
		with gr.Column():
			image_yolo = gr.Image(type="pil", label="YOLO Detections")
			label_owl = gr.Label(label="OWLv2 Prediction")
		

	btn_submit.click(fn=predict, inputs=[image, conf_threshold], outputs=[image_yolo, label_owl])


if __name__ == "__main__":
	app.launch()