import os
import spaces
from unittest.mock import patch

import gradio as gr
import numpy as np
import supervision as sv
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

from utils.imports import fixed_get_imports
from utils.models import (
    run_captioning,
    CAPTIONING_TASK,
    run_caption_to_phrase_grounding
)
from utils.video import (
    create_directory,
    remove_files_older_than,
    generate_file_name,
    calculate_end_frame_index
)

MARKDOWN = """
# Florence-2 for Videos 🎬
Colab Roboflow arXiv
""" RESULTS = "results" CHECKPOINT = "microsoft/Florence-2-base-ft" DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports): MODEL = AutoModelForCausalLM.from_pretrained( CHECKPOINT, trust_remote_code=True).to(DEVICE) PROCESSOR = AutoProcessor.from_pretrained( CHECKPOINT, trust_remote_code=True) BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(color_lookup=sv.ColorLookup.TRACK) LABEL_ANNOTATOR = sv.LabelAnnotator(color_lookup=sv.ColorLookup.TRACK) TRACKER = sv.ByteTrack() # creating video results directory create_directory(directory_path=RESULTS) def annotate_image( input_image: np.ndarray, detections: sv.Detections ) -> np.ndarray: output_image = input_image.copy() output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections) output_image = LABEL_ANNOTATOR.annotate(output_image, detections) return output_image @spaces.GPU def process_video( input_video: str, progress=gr.Progress(track_tqdm=True) ) -> str: # cleanup of old video files remove_files_older_than(RESULTS, 30) video_info = sv.VideoInfo.from_video_path(input_video) video_info.fps = video_info.fps // 2 total = calculate_end_frame_index(input_video) frame_generator = sv.get_video_frames_generator( source_path=input_video, end=total, stride=2 ) result_file_name = generate_file_name(extension="mp4") result_file_path = os.path.join(RESULTS, result_file_name) TRACKER.reset() with sv.VideoSink(result_file_path, video_info=video_info) as sink: for _ in tqdm(range(total), desc="Processing video..."): frame = next(frame_generator) caption = run_captioning( model=MODEL, processor=PROCESSOR, image=frame, device=DEVICE )[CAPTIONING_TASK] detections = run_caption_to_phrase_grounding( model=MODEL, processor=PROCESSOR, caption=caption, image=frame, device=DEVICE ) detections.confidence = np.ones(len(detections)) detections.class_id = np.zeros(len(detections)) detections = TRACKER.update_with_detections(detections) frame = annotate_image( input_image=frame, detections=detections ) sink.write_frame(frame) return result_file_path with gr.Blocks() as demo: gr.Markdown(MARKDOWN) with gr.Row(): input_video_component = gr.Video( label='Input Video' ) output_video_component = gr.Video( label='Output Video' ) with gr.Row(): submit_button_component = gr.Button( value='Submit', scale=1, variant='primary' ) submit_button_component.click( fn=process_video, inputs=[ input_video_component, ], outputs=output_video_component ) demo.launch(debug=False, show_error=True, max_threads=1)