import time
import uuid

import cv2
import gradio as gr
import numpy as np
import spaces
import supervision as sv
import torch
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "omlab/omdet-turbo-swin-tiny-hf"
).to(device)

css = """
.feedback textarea {font-size: 24px !important}
"""

# Module-level state shared with `process_video`, which declares these names
# `global` before mutating them. (The original also had no-op `global`
# statements at module level; they are dropped here.)
classes = "person, bike, car"
detections = None
labels = None
threshold = 0.2

BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()

# Run inference on every SUBSAMPLE-th frame; intermediate frames reuse the
# most recent detections.
SUBSAMPLE = 2


def annotate_image(input_image, detections, labels) -> np.ndarray:
    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
    return output_image


@spaces.GPU
def process_video(
    input_video,
    confidence_threshold,
    classes_new,
    progress=gr.Progress(track_tqdm=True),
):
    global detections
    global labels
    global classes
    global threshold
    classes = classes_new
    threshold = confidence_threshold

    result_file_name = f"output_{uuid.uuid4()}.mp4"

    cap = cv2.VideoCapture(input_video)
    video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    desired_fps = fps // SUBSAMPLE

    iterating, frame = cap.read()
    segment_file = cv2.VideoWriter(
        result_file_name, video_codec, desired_fps, (width, height)
    )  # type: ignore

    batch = []
    frames = []
    predict_index = []
    n_frames = 0

    while iterating:
        # frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
        if n_frames % SUBSAMPLE == 0:
            predict_index.append(len(frames))
            batch.append(frame)
        frames.append(frame)

        # Once a second's worth of subsampled frames has accumulated, run one
        # batched inference pass and write out the annotated segment.
        if len(batch) == desired_fps:
            # Strip whitespace around each class name so prompts and index
            # lookups stay consistent (the original only stripped the ends of
            # the whole string before splitting on ",").
            classes_list = [c.strip() for c in classes.split(",")]
            # `query` also reports the model's inference FPS; renamed from
            # `fps` to avoid shadowing the video frame rate read above.
            results, inference_fps = query(
                batch, classes_list, threshold, (width, height)
            )

            for i in range(len(frames)):
                if i in predict_index:
                    batch_index = predict_index.index(i)
                    detections = sv.Detections(
                        xyxy=results[batch_index]["boxes"].cpu().detach().numpy(),
                        confidence=results[batch_index]["scores"]
                        .cpu()
                        .detach()
                        .numpy(),
                        class_id=np.array(
                            [
                                classes_list.index(results_class)
                                for results_class in results[batch_index]["classes"]
                            ]
                        ),
                        data={"class_name": results[batch_index]["classes"]},
                    )
                    labels = results[batch_index]["classes"]

                frame = annotate_image(
                    input_image=frames[i],
                    detections=detections,
                    labels=labels,
                )
                segment_file.write(frame)

            segment_file.release()
            # The Markdown message was truncated in the source; the FPS
            # report below is a plausible reconstruction.
            yield (
                result_file_name,
                gr.Markdown(f'Model inference FPS: {inference_fps:.2f}'),
            )

            # Reconstructed from context (truncated in the source): start a
            # fresh output segment and clear the per-segment buffers, then
            # keep reading frames.
            result_file_name = f"output_{uuid.uuid4()}.mp4"
            segment_file = cv2.VideoWriter(
                result_file_name, video_codec, desired_fps, (width, height)
            )  # type: ignore
            batch = []
            frames = []
            predict_index = []

        iterating, frame = cap.read()
        n_frames += 1

    cap.release()
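

# --- Reconstruction note ---------------------------------------------------
# `query` is called above but its definition is missing from this excerpt
# (defining it after `process_video` is fine, since the name is resolved at
# call time). The sketch below is an assumption, not the original
# implementation: it batches the frames through the OmDet-Turbo processor and
# model, times the forward pass (which is why `time` is imported above), and
# post-processes with `post_process_grounded_object_detection`. Keyword names
# vary across `transformers` versions; older releases, matching the
# "boxes"/"scores"/"classes" result keys used above, accept `classes=` and
# `score_threshold=`.
def query(batch, classes_list, confidence_threshold, size):
    inputs = processor(
        images=batch,
        text=[classes_list] * len(batch),
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        start = time.time()
        outputs = model(**inputs)
        inference_fps = len(batch) / (time.time() - start)
    results = processor.post_process_grounded_object_detection(
        outputs,
        classes=[classes_list] * len(batch),
        score_threshold=confidence_threshold,
        # Target size for rescaling boxes, passed through from the caller.
        target_sizes=[size] * len(batch),
    )
    return results, inference_fps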