Spaces:

yonigozlan
/

omdet-turbo-open-vocabulary

Running on Zero

App Files Files Community

omdet-turbo-open-vocabulary / app.py

yonigozlan HF staff

add app.py

54e5741 4 months ago

raw

history blame

6.04 kB

	import os
	import time

	import cv2
	import gradio as gr
	import numpy as np
	import spaces
	import supervision as sv
	import torch
	from PIL import Image
	from tqdm import tqdm

	from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

	# Pick the compute device once at import time: CUDA when torch reports it
	# as available, the CPU otherwise.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")

	# The processor and the zero-shot detection model are both loaded from the
	# same checkpoint identifier; the model is then moved to the chosen device.
	processor = AutoProcessor.from_pretrained("omdet-turbo-tiny-timm")
	model = AutoModelForZeroShotObjectDetection.from_pretrained("omdet-turbo-tiny-timm")
	model = model.to(device)

	# Custom CSS handed to gr.Blocks below.
	css = """
	#warning {background-color: #FFCCCB}
	.feedback textarea {font-size: 24px !important}
	"""

	# Shared supervision annotators, created once and reused for every frame.
	BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
	MASK_ANNOTATOR = sv.MaskAnnotator()
	LABEL_ANNOTATOR = sv.LabelAnnotator()


	def calculate_end_frame_index(source_video_path):
	    """Return the number of leading frames of the video to process.

	    The result is the smaller of the video's total frame count and twice its
	    frames-per-second value (the frame count of two seconds of footage), both
	    taken from ``sv.VideoInfo`` for ``source_video_path``.
	    """
	    info = sv.VideoInfo.from_video_path(source_video_path)
	    frame_count = info.total_frames
	    two_second_cap = info.fps * 2
	    # Same tie-breaking as min(frame_count, two_second_cap).
	    return two_second_cap if two_second_cap < frame_count else frame_count


	def annotate_image(input_image, detections, labels) -> np.ndarray:
	    """Draw masks, bounding boxes and text labels for ``detections``.

	    The module-level annotators are applied in a fixed order (mask, then
	    bounding box, then label); each one receives the previous one's output.
	    ``labels`` is forwarded to the label annotator only.
	    """
	    steps = (
	        (MASK_ANNOTATOR, {}),
	        (BOUNDING_BOX_ANNOTATOR, {}),
	        (LABEL_ANNOTATOR, {"labels": labels}),
	    )
	    canvas = input_image
	    for annotator, extra_kwargs in steps:
	        canvas = annotator.annotate(canvas, detections, **extra_kwargs)
	    return canvas


	def resize_to_max_side(frame: np.ndarray, max_side: int = 640):
	    """Resize ``frame`` so that its longer side is ``max_side`` pixels.

	    The shorter side is scaled by the same factor and truncated to an
	    integer, so the aspect ratio is kept up to rounding.

	    Args:
	        frame: Image array whose first two dimensions are (height, width).
	        max_side: Target length of the longer side; coerced to ``int``.

	    Returns:
	        The array returned by ``cv2.resize`` for the computed size.
	    """
	    h, w = frame.shape[:2]
	    # cv2.resize needs integer sizes; a UI slider may hand over a float.
	    max_side = int(max_side)
	    # Clamp the shorter side to 1 px: for very elongated frames the
	    # truncation would otherwise give 0, which cv2.resize rejects.
	    if h > w:
	        new_h, new_w = max_side, max(1, int(w * max_side / h))
	    else:
	        new_h, new_w = max(1, int(h * max_side / w)), max_side

	    # cv2.resize takes the target size as (width, height).
	    return cv2.resize(frame, (new_w, new_h))


	@spaces.GPU
	def process_video(
	    input_video,
	    confidence_threshold,
	    classes,
	    max_side,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Detect the requested classes in the first frames of a video.

	    Up to ``calculate_end_frame_index(input_video)`` frames are read, passed
	    through ``query``, annotated with ``annotate_image`` and written to
	    ``output.mp4`` in the current working directory.

	    Args:
	        input_video: Path of the video to process.
	        confidence_threshold: Score threshold forwarded to ``query``.
	        classes: Comma-separated class names, e.g. ``"person, cat, dog"``.
	        max_side: Longer-side size the frames are resized to for inference.
	        progress: Not used directly in the body; declared so Gradio can
	            mirror the tqdm progress bar.

	    Returns:
	        A tuple ``(result_file_path, markdown)`` where ``markdown`` is a
	        visible ``gr.Markdown`` showing the mean per-frame inference FPS.

	    Raises:
	        gr.Error: If ``classes`` contains no non-empty class name.
	    """
	    # Trim every entry (not just the ends of the whole string) and drop
	    # empty ones, so "person, cat" yields "cat" rather than " cat".
	    classes = [name.strip() for name in classes.split(",") if name.strip()]
	    if not classes:
	        raise gr.Error("Please provide at least one object to detect.")

	    video_info = sv.VideoInfo.from_video_path(input_video)
	    total = calculate_end_frame_index(input_video)
	    frame_generator = sv.get_video_frames_generator(source_path=input_video, end=total)

	    # NOTE(review): the output path is fixed, so overlapping requests would
	    # write to the same file - confirm whether that can happen in deployment.
	    result_file_name = "output.mp4"
	    result_file_path = os.path.join(os.getcwd(), result_file_name)
	    all_fps = []
	    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
	        # Iterate the generator itself instead of calling next() `total`
	        # times: if fewer frames can be decoded than the metadata reports,
	        # the loop simply ends instead of raising StopIteration.
	        for frame in tqdm(frame_generator, total=total, desc="Processing video.."):
	            results, fps = query(
	                frame, classes, confidence_threshold, max_side=max_side
	            )
	            all_fps.append(fps)

	            frame_result = results[0]
	            detected_classes = frame_result["classes"]
	            detections = sv.Detections(
	                xyxy=frame_result["boxes"].cpu().detach().numpy(),
	                confidence=frame_result["scores"].cpu().detach().numpy(),
	                # Explicit integer dtype: an empty list would otherwise
	                # produce a float64 array when nothing is detected.
	                class_id=np.array(
	                    [classes.index(name) for name in detected_classes],
	                    dtype=int,
	                ),
	                data={"class_name": detected_classes},
	            )
	            frame = annotate_image(
	                input_image=frame,
	                detections=detections,
	                labels=detected_classes,
	            )
	            sink.write_frame(frame)

	    # Avoid NaN (and a numpy warning) when no frame could be read.
	    avg_fps = np.mean(all_fps) if all_fps else 0.0
	    return result_file_path, gr.Markdown(
	        f'<h3 style="text-align: center;">Model inference FPS: {avg_fps:.2f}</h3>',
	        visible=True,
	    )


	def query(frame, classes, confidence_threshold, max_side=360):
	    """Run the detector on a single frame and time the forward pass.

	    The frame is resized with ``resize_to_max_side``, converted from BGR to
	    RGB, wrapped in a PIL image and passed through the module-level
	    ``processor`` and ``model``.

	    Args:
	        frame: Image array; its first two dimensions are used as the target
	            size for post-processing.
	        classes: List of class-name strings used as the text prompt.
	        confidence_threshold: Score threshold passed to post-processing.
	        max_side: Longer-side size used when resizing the frame.

	    Returns:
	        A tuple ``(results, fps)``: the output of
	        ``processor.post_process_grounded_object_detection`` and the inverse
	        of the measured forward-pass time in seconds.
	    """
	    frame_resized = resize_to_max_side(frame, max_side=max_side)
	    image = Image.fromarray(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
	    inputs = processor(images=image, text=classes, return_tensors="pt").to(device)

	    # CUDA kernels are launched asynchronously, so without synchronising the
	    # timer could stop before the forward pass has actually finished.
	    on_cuda = device.type == "cuda"
	    with torch.no_grad():
	        if on_cuda:
	            torch.cuda.synchronize()
	        start = time.perf_counter()
	        outputs = model(**inputs)
	        if on_cuda:
	            torch.cuda.synchronize()
	        elapsed = time.perf_counter() - start
	    # Guard against a zero reading from the clock.
	    fps = 1 / max(elapsed, 1e-9)

	    # Post-process against the original (un-resized) frame size.
	    target_sizes = [frame.shape[:2]]
	    results = processor.post_process_grounded_object_detection(
	        outputs=outputs,
	        classes=classes,
	        score_threshold=confidence_threshold,
	        target_sizes=target_sizes,
	    )
	    return results, fps


	# Gradio UI: intro text, input/output videos side by side, a row of
	# detection controls, clickable examples and the submit handler.
	# NOTE(review): the original indentation was lost; the nesting below is
	# reconstructed from the syntax and the intro text - confirm the layout.
	with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
	    gr.Markdown("## Real Time Open Vocabulary Object Detection with Omdet-Turbo")
	    gr.Markdown(
	        "This is a demo for open vocabulary object detection using OmDet-Turbo. \\"
	        "It runs on ZeroGPU which captures GPU every first time you infer. This combined with video processing time means that the demo inference time is slower than the model's actual inference time. \\"
	        "The actual model inference FPS is displayed under the processed video after inference."
	    )
	    gr.Markdown(
	        "Simply upload a video, and write the objects you want to detect! You can also play with confidence threshold, image size, or try the examples below. 👇"
	    )

	    with gr.Row():
	        with gr.Column():
	            input_video = gr.Video(label="Input Video")
	            submit = gr.Button()
	        with gr.Column():
	            output_video = gr.Video(label="Output Video")
	            # Hidden until process_video returns a visible Markdown for it.
	            actual_fps = gr.Markdown("", visible=False)
	    with gr.Row():
	        classes = gr.Textbox(
	            "person, cat, dog",
	            label="Objects to detect. Change this as you like!",
	            elem_classes="feedback",
	            scale=3,
	        )
	        conf = gr.Slider(
	            label="Confidence Threshold",
	            minimum=0.1,
	            maximum=1.0,
	            value=0.2,
	            step=0.05,
	        )
	        max_side = gr.Slider(
	            label="Image Size",
	            minimum=240,
	            maximum=1080,
	            value=640,
	            step=10,
	        )
	    # process_video returns (video path, FPS markdown); list both outputs
	    # here, as submit.click does, so running an example fills both.
	    example = gr.Examples(
	        fn=process_video,
	        examples=[
	            ["./football.mp4", 0.3, "person, ball, shoe", 640],
	            ["./cat.mp4", 0.2, "cat", 640],
	            ["./safari2.mp4", 0.3, "elephant, giraffe, springbok, zebra", 640],
	        ],
	        inputs=[input_video, conf, classes, max_side],
	        outputs=[output_video, actual_fps],
	    )

	    submit.click(
	        fn=process_video,
	        inputs=[input_video, conf, classes, max_side],
	        outputs=[output_video, actual_fps],
	    )

	# Start the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	    launch_options = {"show_error": True}
	    demo.launch(**launch_options)