er1t0 committed
Commit 8870220
1 Parent(s): 9b87d5a

torch autocast

Files changed (2):
1. .gitignore +3 -0
2. app.py +55 -46
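
The app.py change replaces the with torch.amp.autocast(...) / torch.cuda.amp.autocast(...) context managers inside run_florence and process_video with @torch.inference_mode() and @torch.autocast(device_type="cuda", dtype=torch.bfloat16) decorators applied to the whole functions. A minimal sketch of the before/after pattern, for illustration only (run_inference_old, run_inference, model, and x are placeholder names, not from app.py; a CUDA device is assumed):

import torch

# Before: autocast scoped by a context manager inside the function body
# (placeholder function, mirroring the removed pattern).
def run_inference_old(model, x):
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        return model(x)

# After: the whole function runs under bfloat16 autocast, with autograd
# tracking disabled by inference_mode (placeholder names; CUDA assumed).
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def run_inference(model, x):
    return model(x)

With the decorators covering the full function bodies, the inner context managers become unnecessary, which is what the diff below removes.
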
.gitignore ADDED
@@ -0,0 +1,3 @@
+temp_frames
+temp_frames_30
+segmented_video.mp4
app.py CHANGED
@@ -41,6 +41,8 @@ florence_model = load_model_without_flash_attn(load_florence_model)
 florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 
+
+
 def apply_color_mask(frame, mask, obj_id):
     cmap = plt.get_cmap("tab10")
     color = np.array(cmap(obj_id % 10)[:3])  # Use modulo 10 to cycle through colors
@@ -61,25 +63,26 @@ def apply_color_mask(frame, mask, obj_id):
     colored_mask = mask * color
     return frame * (1 - mask) + colored_mask * 255
 
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def run_florence(image, text_input):
-    with torch.amp.autocast(dtype=torch.bfloat16):
-        task_prompt = '<OPEN_VOCABULARY_DETECTION>'
-        prompt = task_prompt + text_input
-        inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
-        generated_ids = florence_model.generate(
-            input_ids=inputs["input_ids"].cuda(),
-            pixel_values=inputs["pixel_values"].cuda(),
-            max_new_tokens=1024,
-            early_stopping=False,
-            do_sample=False,
-            num_beams=3,
-        )
-        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = florence_processor.post_process_generation(
-            generated_text,
-            task=task_prompt,
-            image_size=(image.width, image.height)
-        )
+    task_prompt = '<OPEN_VOCABULARY_DETECTION>'
+    prompt = task_prompt + text_input
+    inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
+    generated_ids = florence_model.generate(
+        input_ids=inputs["input_ids"].cuda(),
+        pixel_values=inputs["pixel_values"].cuda(),
+        max_new_tokens=1024,
+        early_stopping=False,
+        do_sample=False,
+        num_beams=3,
+    )
+    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    parsed_answer = florence_processor.post_process_generation(
+        generated_text,
+        task=task_prompt,
+        image_size=(image.width, image.height)
+    )
     return parsed_answer[task_prompt]['bboxes'][0]
 
 def remove_directory_contents(directory):
@@ -89,7 +92,8 @@ def remove_directory_contents(directory):
         for name in dirs:
             os.rmdir(os.path.join(root, name))
 
-
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def process_video(video_path, prompt):
     try:
         # Get video info
@@ -123,14 +127,13 @@ def process_video(video_path, prompt):
         print("Reshaped mask box:", mask_box)
 
         # SAM2 segmentation on first frame
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            image_predictor.set_image(first_frame)
-            masks, _, _ = image_predictor.predict(
-                point_coords=None,
-                point_labels=None,
-                box=mask_box[None, :],
-                multimask_output=False,
-            )
+        image_predictor.set_image(first_frame)
+        masks, _, _ = image_predictor.predict(
+            point_coords=None,
+            point_labels=None,
+            box=mask_box[None, :],
+            multimask_output=False,
+        )
         print("masks.shape", masks.shape)
 
         mask = masks.squeeze().astype(bool)
@@ -145,21 +148,20 @@ def process_video(video_path, prompt):
 
         print(f"Saved {len(frames)} temporary frames")
 
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            inference_state = video_predictor.init_state(video_path=temp_dir)
-            _, _, _ = video_predictor.add_new_mask(
-                inference_state=inference_state,
-                frame_idx=0,
-                obj_id=1,
-                mask=mask
-            )
-
-            video_segments = {}
-            for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
-                video_segments[out_frame_idx] = {
-                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-                    for i, out_obj_id in enumerate(out_obj_ids)
-                }
+        inference_state = video_predictor.init_state(video_path=temp_dir)
+        _, _, _ = video_predictor.add_new_mask(
+            inference_state=inference_state,
+            frame_idx=0,
+            obj_id=1,
+            mask=mask
+        )
+
+        video_segments = {}
+        for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+            video_segments[out_frame_idx] = {
+                out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                for i, out_obj_id in enumerate(out_obj_ids)
+            }
 
         print('Segmenting for main vid done')
         print(f"Number of segmented frames: {len(video_segments)}")
@@ -216,12 +218,19 @@ def segment_video(video_file, prompt):
 demo = gr.Interface(
     fn=segment_video,
     inputs=[
-        gr.Video(label="Upload Video"),
-        gr.Textbox(label="Enter prompt (e.g., 'a gymnast')")
+        gr.Video(label="Upload Video (Keep it under 10 seconds for this demo)"),
+        gr.Textbox(label="Enter text prompt for object detection")
     ],
     outputs=gr.Video(label="Segmented Video"),
-    title="Video Object Segmentation with Florence and SAM2",
-    description="Upload a video and provide a text prompt to segment a specific object throughout the video."
+    title="Text-Prompted Video Object Segmentation",
+    description="""
+    This demo uses [Florence-2](https://huggingface.co/microsoft/Florence-2-large), a vision-language model, to enable text-prompted object detection for [SAM2](https://github.com/facebookresearch/segment-anything).
+    Florence-2 interprets your text prompt, allowing SAM2 to segment the described object in the video.
+
+    1. Upload a short video (< 10 sec)
+    2. Describe the object to segment
+    3. Get your segmented video!
+    """
 )
 
 demo.launch()
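
For context on how the demo is driven, gr.Interface passes the uploaded file path and the text prompt to segment_video(video_file, prompt). A hypothetical direct call, for illustration only (the input file name is assumed, not from the repo; the return value is inferred from outputs=gr.Video):

# Hypothetical local invocation, bypassing Gradio. "input.mp4" is an assumed
# path; the prompt reuses the example from the old textbox label.
# Since the interface output is gr.Video, segment_video presumably returns a
# path to the rendered result (segmented_video.mp4 is now ignored via .gitignore).
result = segment_video("input.mp4", "a gymnast")
print(result)
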