ighoshsubho committed
Commit 9aecc37 (0 parents)

Florence sam flux first commit

Files changed (6)
  1. .gitignore +3 -0
  2. README.md +12 -0
  3. app.py +121 -0
  4. requirements.txt +13 -0
  5. utils/florence.py +58 -0
  6. utils/sam.py +45 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ /venv
+ /.idea
+ /tmp
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Florence2 + SAM2 + FLUX
+ emoji: 🔥
+ colorFrom: purple
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.40.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,121 @@
+ import torch
+ import numpy as np
+ from PIL import Image
+ from diffusers import FluxInpaintPipeline
+ from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+ from utils.sam import load_sam_image_model, run_sam_inference
+ import gradio as gr
+ import supervision as sv
+
+ # Load models
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ FLUX_PIPE = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(
+     DEVICE)
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+ SAM_MODEL = load_sam_image_model(device=DEVICE)
+
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
+ LABEL_ANNOTATOR = sv.LabelAnnotator(
+     color=COLOR_PALETTE,
+     color_lookup=sv.ColorLookup.INDEX,
+     text_position=sv.Position.CENTER_OF_MASS,
+     text_color=sv.Color.from_hex("#000000"),
+     border_radius=5
+ )
+ MASK_ANNOTATOR = sv.MaskAnnotator(
+     color=COLOR_PALETTE,
+     color_lookup=sv.ColorLookup.INDEX
+ )
+
+
+ def visualize_detections(image, detections):
+     output_image = image.copy()
+     output_image = MASK_ANNOTATOR.annotate(output_image, detections)
+     output_image = BOX_ANNOTATOR.annotate(output_image, detections)
+     output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
+     return output_image
+
+
+ def detect_objects(image, text_prompt):
+     # Use Florence for object detection
+     _, result = run_florence_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image,
+         task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+         text=text_prompt
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image.size
+     )
+
+     # Use SAM to refine masks
+     detections = run_sam_inference(SAM_MODEL, image, detections)
+     return detections
+
+
+ def inpaint_selected_objects(image, detections, selected_indices, inpaint_prompt):
+     mask = np.zeros(image.size[::-1], dtype=np.uint8)
+     for idx in selected_indices:
+         mask |= detections.mask[idx]
+
+     mask_image = Image.fromarray(mask * 255)
+
+     result = FLUX_PIPE(
+         prompt=inpaint_prompt,
+         image=image,
+         mask_image=mask_image,
+         num_inference_steps=30,
+         strength=0.85,
+     ).images[0]
+
+     return result
+
+
+ def process_image(input_image, detection_prompt, inpaint_prompt, selected_objects):
+     detections = detect_objects(input_image, detection_prompt)
+
+     # Visualize detected objects
+     detected_image = visualize_detections(input_image, detections)
+
+     if selected_objects:
+         selected_indices = [int(idx) for idx in selected_objects.split(',')]
+         inpainted_image = inpaint_selected_objects(input_image, detections, selected_indices, inpaint_prompt)
+         return detected_image, inpainted_image
+     else:
+         return detected_image, None
+
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Object Detection and Inpainting with FLUX, Florence, and SAM")
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(type="pil", label="Input Image")
+             detection_prompt = gr.Textbox(label="Detection Prompt", placeholder="Enter objects to detect")
+             detect_button = gr.Button("Detect Objects")
+         with gr.Column():
+             detected_image = gr.Image(type="pil", label="Detected Objects")
+             selected_objects = gr.Textbox(label="Selected Objects",
+                                           placeholder="Enter indices of objects to inpaint (comma-separated)")
+             inpaint_prompt = gr.Textbox(label="Inpainting Prompt", placeholder="Describe what to inpaint")
+             inpaint_button = gr.Button("Inpaint Selected Objects")
+             output_image = gr.Image(type="pil", label="Inpainted Result")
+
+     detect_button.click(
+         fn=lambda img, prompt: process_image(img, prompt, "", "")[0],
+         inputs=[input_image, detection_prompt],
+         outputs=detected_image
+     )
+     inpaint_button.click(
+         fn=process_image,
+         inputs=[input_image, detection_prompt, inpaint_prompt, selected_objects],
+         outputs=[detected_image, output_image]
+     )
+
+ demo.launch(debug=False, show_error=True)
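
Note on inpaint_selected_objects: the SAM masks in detections.mask are boolean (N, H, W) arrays; the selected ones are OR-combined into a single uint8 mask and scaled to 0/255 before being handed to the FLUX inpainting pipeline as a PIL image. Below is a self-contained sketch of just that step, with dummy masks standing in for detections.mask (no models required):

    import numpy as np
    from PIL import Image

    # Dummy stand-ins for detections.mask: two boolean (H, W) object masks.
    h, w = 64, 64
    masks = np.zeros((2, h, w), dtype=bool)
    masks[0, 10:30, 10:30] = True
    masks[1, 40:60, 40:60] = True

    selected_indices = [0, 1]
    combined = np.zeros((h, w), dtype=np.uint8)
    for idx in selected_indices:
        combined |= masks[idx]  # OR-combine the selected object masks

    mask_image = Image.fromarray(combined * 255)  # white = region to inpaint
    print(mask_image.size, mask_image.getextrema())  # (64, 64) (0, 255)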
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ tqdm
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
+ pytest
+ torch
+ numpy
+ diffusers
utils/florence.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
+ FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+
+ def load_florence_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_florence_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
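
run_florence_inference returns both the raw generated text and the task-specific post-processed response; for detection tasks the response can be converted into supervision detections, which is exactly how app.py consumes it. A minimal standalone sketch, where sample.jpg and the "a cat" prompt are placeholders:

    import torch
    import supervision as sv
    from PIL import Image
    from utils.florence import (
        load_florence_model,
        run_florence_inference,
        FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, processor = load_florence_model(device=device)

    image = Image.open("sample.jpg")  # placeholder path
    _, response = run_florence_inference(
        model=model,
        processor=processor,
        device=device,
        image=image,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text="a cat",
    )
    # Boxes from the Florence-2 response, ready for SAM refinement or annotation.
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size
    )
    print(detections.xyxy)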
utils/sam.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Any
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
+ SAM_CONFIG = "sam2_hiera_s.yaml"
+
+
+ def load_sam_image_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> SAM2ImagePredictor:
+     model = build_sam2(config, checkpoint, device=device)
+     return SAM2ImagePredictor(sam_model=model)
+
+
+ def load_sam_video_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> Any:
+     return build_sam2_video_predictor(config, checkpoint, device=device)
+
+
+ def run_sam_inference(
+     model: Any,
+     image: Image,
+     detections: sv.Detections
+ ) -> sv.Detections:
+     image = np.array(image.convert("RGB"))
+     model.set_image(image)
+     mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
+
+     # dirty fix; remove this later
+     if len(mask.shape) == 4:
+         mask = np.squeeze(mask)
+
+     detections.mask = mask.astype(bool)
+     return detections
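
run_sam_inference expects a supervision Detections object that already carries xyxy boxes (Florence-2 output in this app) and overwrites its mask field with SAM2 segmentation masks. A minimal standalone sketch with a hand-written box in place of Florence-2 output, assuming the sam2_hiera_small.pt checkpoint exists under checkpoints/ and sample.jpg is a placeholder path:

    import numpy as np
    import supervision as sv
    import torch
    from PIL import Image
    from utils.sam import load_sam_image_model, run_sam_inference

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sam_model = load_sam_image_model(device=device)

    image = Image.open("sample.jpg")  # placeholder path
    # A single hand-written xyxy box standing in for Florence-2 detections.
    detections = sv.Detections(xyxy=np.array([[50, 40, 300, 260]], dtype=float))
    detections = run_sam_inference(sam_model, image, detections)
    print(detections.mask.shape)  # (num_boxes, H, W) boolean masks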