er1t0 committed on
Commit 9b87d5a
1 Parent(s): 7976ee8

shift to ffmpeg

Files changed (4):
  1. app.py +108 -98
  2. myapp2.py +204 -0
  3. packages.txt +1 -0
  4. requirements.txt +2 -1
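
In short, the commit replaces app.py's OpenCV-based chunked read/write pipeline with a single ffmpeg-python pass: the clip is probed for its dimensions and frame rate, decoded to raw RGB frames over a pipe, segmented, and the frames are piped back into ffmpeg to encode the output MP4. A minimal sketch of that decode/encode round trip (illustrative only; "input.mp4"/"output.mp4" and the fixed 30 fps are placeholders, and it assumes the ffmpeg binary plus the ffmpeg-python package are available):

# Sketch of the raw-frame round trip that the new process_video() builds on.
import ffmpeg
import numpy as np

probe = ffmpeg.probe("input.mp4")
info = next(s for s in probe["streams"] if s["codec_type"] == "video")
width, height = int(info["width"]), int(info["height"])

# Decode every frame to raw RGB bytes over a pipe and view them as a numpy array.
raw, _ = (
    ffmpeg.input("input.mp4")
    .output("pipe:", format="rawvideo", pix_fmt="rgb24")
    .run(capture_stdout=True)
)
frames = np.frombuffer(raw, np.uint8).reshape([-1, height, width, 3])

# Pipe the (possibly modified) raw RGB frames back into ffmpeg to encode an MP4.
writer = (
    ffmpeg.input("pipe:", format="rawvideo", pix_fmt="rgb24", s=f"{width}x{height}", r=30)
    .output("output.mp4", pix_fmt="yuv420p")
    .overwrite_output()
    .run_async(pipe_stdin=True)
)
for frame in frames:
    writer.stdin.write(frame.tobytes())
writer.stdin.close()
writer.wait()
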
app.py CHANGED
@@ -1,4 +1,3 @@
-
  import os
  import torch
  import numpy as np
@@ -10,6 +9,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
  import cv2
  import traceback
  import matplotlib.pyplot as plt
+ import ffmpeg
  from utils import load_model_without_flash_attn
  
  
@@ -62,7 +62,7 @@ def apply_color_mask(frame, mask, obj_id):
      return frame * (1 - mask) + colored_mask * 255
  
  def run_florence(image, text_input):
-     with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+     with torch.amp.autocast(dtype=torch.bfloat16):
          task_prompt = '<OPEN_VOCABULARY_DETECTION>'
          prompt = task_prompt + text_input
          inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
@@ -89,125 +89,135 @@ def remove_directory_contents(directory):
          for name in dirs:
              os.rmdir(os.path.join(root, name))
  
- def process_video(video_path, prompt, chunk_size=30):
+
+ def process_video(video_path, prompt):
      try:
-         video = cv2.VideoCapture(video_path)
-         if not video.isOpened():
-             raise ValueError("Unable to open video file")
+         # Get video info
+         probe = ffmpeg.probe(video_path)
+         video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
+         width = int(video_info['width'])
+         height = int(video_info['height'])
+         num_frames = int(video_info['nb_frames'])
+         fps = eval(video_info['r_frame_rate'])
+
+         print(f"Video info: {width}x{height}, {num_frames} frames, {fps} fps")
+
+         # Read frames
+         out, _ = (
+             ffmpeg
+             .input(video_path)
+             .output('pipe:', format='rawvideo', pix_fmt='rgb24')
+             .run(capture_stdout=True)
+         )
+         frames = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+
+         print(f"Read {len(frames)} frames")
+
+         # Florence detection on first frame
+         first_frame = Image.fromarray(frames[0])
+         mask_box = run_florence(first_frame, prompt)
+         print("Original mask box:", mask_box)
  
-         fps = video.get(cv2.CAP_PROP_FPS)
-         frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+         # Convert mask_box to numpy array
+         mask_box = np.array(mask_box)
+         print("Reshaped mask box:", mask_box)
  
-         # Process video in chunks
-         all_segmented_frames = []
-         for chunk_start in range(0, frame_count, chunk_size):
-             chunk_end = min(chunk_start + chunk_size, frame_count)
-
-             frames = []
-             video.set(cv2.CAP_PROP_POS_FRAMES, chunk_start)
-             for _ in range(chunk_end - chunk_start):
-                 ret, frame = video.read()
-                 if not ret:
-                     break
-                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-
-             if not frames:
-                 print(f"No frames extracted for chunk starting at {chunk_start}")
-                 continue
-
-             # Florence detection on first frame of the chunk
-             first_frame = Image.fromarray(frames[0])
-             mask_box = run_florence(first_frame, prompt)
-             print("Original mask box:", mask_box)
-
-             # Convert mask_box to numpy array and ensure it's in the correct format
-             mask_box = np.array(mask_box)
-             print("Reshaped mask box:", mask_box)
-
-             # SAM2 segmentation on first frame
-             with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                 image_predictor.set_image(first_frame)
-                 masks, _, _ = image_predictor.predict(
-                     point_coords=None,
-                     point_labels=None,
-                     box=mask_box[None, :],
-                     multimask_output=False,
-                 )
-             print("masks.shape",masks.shape)
-
-             mask = masks.squeeze().astype(bool)
-             print("Mask shape:", mask.shape)
-             print("Frame shape:", frames[0].shape)
-
-             # SAM2 video propagation
-             temp_dir = f"temp_frames_{chunk_start}"
-             os.makedirs(temp_dir, exist_ok=True)
-             for i, frame in enumerate(frames):
-                 cv2.imwrite(os.path.join(temp_dir, f"{i:04d}.jpg"), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-
-             with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                 inference_state = video_predictor.init_state(video_path=temp_dir)
-                 _, _, _ = video_predictor.add_new_mask(
-                     inference_state=inference_state,
-                     frame_idx=0,
-                     obj_id=1,
-                     mask=mask
-                 )
-
-                 video_segments = {}
-                 for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
-                     video_segments[out_frame_idx] = {
-                         out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-                         for i, out_obj_id in enumerate(out_obj_ids)
-                     }
-
-             print('segmenting for main vid done')
-
-             # Apply segmentation masks to frames
-             for i, frame in enumerate(frames):
-                 if i in video_segments:
-                     for out_obj_id, mask in video_segments[i].items():
-                         frame = apply_color_mask(frame, mask, out_obj_id)
-                     all_segmented_frames.append(frame.astype(np.uint8))
-                 else:
-                     all_segmented_frames.append(frame)
+         # SAM2 segmentation on first frame
+         with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+             image_predictor.set_image(first_frame)
+             masks, _, _ = image_predictor.predict(
+                 point_coords=None,
+                 point_labels=None,
+                 box=mask_box[None, :],
+                 multimask_output=False,
+             )
+         print("masks.shape", masks.shape)
+
+         mask = masks.squeeze().astype(bool)
+         print("Mask shape:", mask.shape)
+         print("Frame shape:", frames[0].shape)
+
+         # SAM2 video propagation
+         temp_dir = "temp_frames"
+         os.makedirs(temp_dir, exist_ok=True)
+         for i, frame in enumerate(frames):
+             Image.fromarray(frame).save(os.path.join(temp_dir, f"{i:04d}.jpg"))
+
+         print(f"Saved {len(frames)} temporary frames")
+
+         with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+             inference_state = video_predictor.init_state(video_path=temp_dir)
+             _, _, _ = video_predictor.add_new_mask(
+                 inference_state=inference_state,
+                 frame_idx=0,
+                 obj_id=1,
+                 mask=mask
+             )
  
-             # Clean up temporary files
-             remove_directory_contents(temp_dir)
-             os.rmdir(temp_dir)
+             video_segments = {}
+             for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+                 video_segments[out_frame_idx] = {
+                     out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                     for i, out_obj_id in enumerate(out_obj_ids)
+                 }
+
+         print('Segmenting for main vid done')
+         print(f"Number of segmented frames: {len(video_segments)}")
  
-         video.release()
+         # Apply segmentation masks to frames
+         all_segmented_frames = []
+         for i, frame in enumerate(frames):
+             if i in video_segments:
+                 for out_obj_id, mask in video_segments[i].items():
+                     frame = apply_color_mask(frame, mask, out_obj_id)
+                 all_segmented_frames.append(frame.astype(np.uint8))
+             else:
+                 all_segmented_frames.append(frame)
  
-         if not all_segmented_frames:
-             raise ValueError("No frames were processed successfully")
-
-         # Create video from segmented frames
+         print(f"Applied masks to {len(all_segmented_frames)} frames")
+
+         # Clean up temporary files
+         remove_directory_contents(temp_dir)
+         os.rmdir(temp_dir)
+
+         # Write output video using ffmpeg
          output_path = "segmented_video.mp4"
-         out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps,
-                               (all_segmented_frames[0].shape[1], all_segmented_frames[0].shape[0]))
+         process = (
+             ffmpeg
+             .input('pipe:', format='rawvideo', pix_fmt='rgb24', s=f'{width}x{height}', r=fps)
+             .output(output_path, pix_fmt='yuv420p')
+             .overwrite_output()
+             .run_async(pipe_stdin=True)
+         )
+
         for frame in all_segmented_frames:
-             out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-         out.release()
+             process.stdin.write(frame.tobytes())
+
+         process.stdin.close()
+         process.wait()
  
+         if not os.path.exists(output_path):
+             raise ValueError(f"Output video file was not created: {output_path}")
+
+         print(f"Successfully created output video: {output_path}")
          return output_path
  
      except Exception as e:
          print(f"Error in process_video: {str(e)}")
          print(traceback.format_exc())  # This will print the full stack trace
          return None
-
- def segment_video(video_file, prompt, chunk_size):
+
+ def segment_video(video_file, prompt):
      if video_file is None:
          return None
-     output_video = process_video(video_file, prompt, int(chunk_size))
+     output_video = process_video(video_file, prompt)
      return output_video
  
  demo = gr.Interface(
      fn=segment_video,
      inputs=[
          gr.Video(label="Upload Video"),
-         gr.Textbox(label="Enter prompt (e.g., 'a gymnast')"),
-         gr.Slider(minimum=10, maximum=100, step=10, value=30, label="Chunk Size (frames)")
+         gr.Textbox(label="Enter prompt (e.g., 'a gymnast')")
      ],
      outputs=gr.Video(label="Segmented Video"),
      title="Video Object Segmentation with Florence and SAM2",
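
A note on the new probe step: ffprobe reports r_frame_rate as a fraction string such as "30000/1001", which the new code converts with eval(). An alternative sketch (not part of the commit) that parses the same string with fractions.Fraction:

# Illustrative alternative to `fps = eval(video_info['r_frame_rate'])`.
from fractions import Fraction

def parse_frame_rate(rate_str):
    """Convert an ffprobe rate string like '30000/1001' to a float fps."""
    return float(Fraction(rate_str))

print(parse_frame_rate("30000/1001"))  # -> 29.97002997002997
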
myapp2.py ADDED
@@ -0,0 +1,204 @@
+ import os
+ import torch
+ import numpy as np
+ import gradio as gr
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ from sam2.build_sam import build_sam2_video_predictor, build_sam2
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+ import cv2
+ import traceback
+ import matplotlib.pyplot as plt
+
+ # CUDA optimizations
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+ if torch.cuda.get_device_properties(0).major >= 8:
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+ # Initialize models
+ sam2_checkpoint = "../checkpoints/sam2_hiera_large.pt"
+ model_cfg = "sam2_hiera_l.yaml"
+
+ video_predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)
+ sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
+ image_predictor = SAM2ImagePredictor(sam2_model)
+
+ model_id = 'microsoft/Florence-2-large'
+ florence_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16).eval().cuda()
+ florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+ def apply_color_mask(frame, mask, obj_id):
+     cmap = plt.get_cmap("tab10")
+     color = np.array(cmap(obj_id % 10)[:3])  # Use modulo 10 to cycle through colors
+
+     # Ensure mask has the correct shape
+     if mask.ndim == 4:
+         mask = mask.squeeze()  # Remove singleton dimensions
+     if mask.ndim == 3 and mask.shape[0] == 1:
+         mask = mask[0]  # Take the first channel if it's a single-channel 3D array
+
+     # Reshape mask to match frame dimensions
+     mask = cv2.resize(mask.astype(np.float32), (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_LINEAR)
+
+     # Expand dimensions of mask and color for broadcasting
+     mask = np.expand_dims(mask, axis=2)
+     color = color.reshape(1, 1, 3)
+
+     colored_mask = mask * color
+     return frame * (1 - mask) + colored_mask * 255
+
+ def run_florence(image, text_input):
+     with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+         task_prompt = '<OPEN_VOCABULARY_DETECTION>'
+         prompt = task_prompt + text_input
+         inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
+         generated_ids = florence_model.generate(
+             input_ids=inputs["input_ids"].cuda(),
+             pixel_values=inputs["pixel_values"].cuda(),
+             max_new_tokens=1024,
+             early_stopping=False,
+             do_sample=False,
+             num_beams=3,
+         )
+         generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+         parsed_answer = florence_processor.post_process_generation(
+             generated_text,
+             task=task_prompt,
+             image_size=(image.width, image.height)
+         )
+         return parsed_answer[task_prompt]['bboxes'][0]
+
+ def remove_directory_contents(directory):
+     for root, dirs, files in os.walk(directory, topdown=False):
+         for name in files:
+             os.remove(os.path.join(root, name))
+         for name in dirs:
+             os.rmdir(os.path.join(root, name))
+
+ def process_video(video_path, prompt, chunk_size=30):
+     try:
+         video = cv2.VideoCapture(video_path)
+         if not video.isOpened():
+             raise ValueError("Unable to open video file")
+
+         fps = video.get(cv2.CAP_PROP_FPS)
+         frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+         # Process video in chunks
+         all_segmented_frames = []
+         for chunk_start in range(0, frame_count, chunk_size):
+             chunk_end = min(chunk_start + chunk_size, frame_count)
+
+             frames = []
+             video.set(cv2.CAP_PROP_POS_FRAMES, chunk_start)
+             for _ in range(chunk_end - chunk_start):
+                 ret, frame = video.read()
+                 if not ret:
+                     break
+                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+             if not frames:
+                 print(f"No frames extracted for chunk starting at {chunk_start}")
+                 continue
+
+             # Florence detection on first frame of the chunk
+             first_frame = Image.fromarray(frames[0])
+             mask_box = run_florence(first_frame, prompt)
+             print("Original mask box:", mask_box)
+
+             # Convert mask_box to numpy array and ensure it's in the correct format
+             mask_box = np.array(mask_box)
+             print("Reshaped mask box:", mask_box)
+
+             # SAM2 segmentation on first frame
+             with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                 image_predictor.set_image(first_frame)
+                 masks, _, _ = image_predictor.predict(
+                     point_coords=None,
+                     point_labels=None,
+                     box=mask_box[None, :],
+                     multimask_output=False,
+                 )
+             print("masks.shape",masks.shape)
+
+             mask = masks.squeeze().astype(bool)
+             print("Mask shape:", mask.shape)
+             print("Frame shape:", frames[0].shape)
+
+             # SAM2 video propagation
+             temp_dir = f"temp_frames_{chunk_start}"
+             os.makedirs(temp_dir, exist_ok=True)
+             for i, frame in enumerate(frames):
+                 cv2.imwrite(os.path.join(temp_dir, f"{i:04d}.jpg"), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+
+             with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                 inference_state = video_predictor.init_state(video_path=temp_dir)
+                 _, _, _ = video_predictor.add_new_mask(
+                     inference_state=inference_state,
+                     frame_idx=0,
+                     obj_id=1,
+                     mask=mask
+                 )
+
+                 video_segments = {}
+                 for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+                     video_segments[out_frame_idx] = {
+                         out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                         for i, out_obj_id in enumerate(out_obj_ids)
+                     }
+
+             print('segmenting for main vid done')
+
+             # Apply segmentation masks to frames
+             for i, frame in enumerate(frames):
+                 if i in video_segments:
+                     for out_obj_id, mask in video_segments[i].items():
+                         frame = apply_color_mask(frame, mask, out_obj_id)
+                     all_segmented_frames.append(frame.astype(np.uint8))
+                 else:
+                     all_segmented_frames.append(frame)
+
+             # Clean up temporary files
+             remove_directory_contents(temp_dir)
+             os.rmdir(temp_dir)
+
+         video.release()
+
+         if not all_segmented_frames:
+             raise ValueError("No frames were processed successfully")
+
+         # Create video from segmented frames
+         output_path = "segmented_video.mp4"
+         out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps,
+                               (all_segmented_frames[0].shape[1], all_segmented_frames[0].shape[0]))
+         for frame in all_segmented_frames:
+             out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+         out.release()
+
+         return output_path
+
+     except Exception as e:
+         print(f"Error in process_video: {str(e)}")
+         print(traceback.format_exc())  # This will print the full stack trace
+         return None
+
+ def segment_video(video_file, prompt, chunk_size):
+     if video_file is None:
+         return None
+     output_video = process_video(video_file, prompt, int(chunk_size))
+     return output_video
+
+ demo = gr.Interface(
+     fn=segment_video,
+     inputs=[
+         gr.Video(label="Upload Video"),
+         gr.Textbox(label="Enter prompt (e.g., 'a gymnast')"),
+         gr.Slider(minimum=10, maximum=100, step=10, value=30, label="Chunk Size (frames)")
+     ],
+     outputs=gr.Video(label="Segmented Video"),
+     title="Video Object Segmentation with Florence and SAM2",
+     description="Upload a video and provide a text prompt to segment a specific object throughout the video."
+ )
+
+ demo.launch()
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt CHANGED
@@ -8,4 +8,5 @@ opencv-python
  matplotlib
  einops
  timm
- pytest
+ pytest
+ ffmpeg-python
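
For context: packages.txt supplies the system-level ffmpeg binary on the Space, while ffmpeg-python in requirements.txt is only the Python binding that shells out to ffmpeg/ffprobe, so both entries are needed. A small sanity check one could run at startup (illustrative; "sample.mp4" is a placeholder file):

# Quick environment check: ffmpeg-python wraps the ffmpeg/ffprobe executables,
# so they must be present on PATH at runtime.
import shutil
import ffmpeg

assert shutil.which("ffmpeg"), "ffmpeg binary not found on PATH"
assert shutil.which("ffprobe"), "ffprobe binary not found on PATH"
print(ffmpeg.probe("sample.mp4")["format"]["format_name"])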