Spaces: Running on Zero
add application
app.py CHANGED
@@ -419,7 +419,7 @@ def segment_fg_bg(images):
         # transform the input images
         input_images = (input_images - means) / stds
         # output = model(input_images)[:, 5]
-        output = model(input_images)['attn'][6]
+        output = model(input_images)['attn'][6]  # [B, H=14, W=14, C]
         fg_act = output[:, 6, 6].mean(0)
         bg_act = output[:, 0, 0].mean(0)
         fg_acts.append(fg_act)
@@ -455,8 +455,8 @@ def segment_fg_bg(images):
         # output = model(input_images)[:, 5]
         output = model(input_images)['attn'][6]
         output = F.normalize(output, dim=-1)
-        heatmap_fg = output @ fg_act[:, None]
-        heatmap_bg = output @ bg_act[:, None]
+        heatmap_fg = output @ fg_act[:, None]  # [B, H, W, 1]
+        heatmap_bg = output @ bg_act[:, None]  # [B, H, W, 1]
         heatmap_fgs.append(heatmap_fg.cpu())
         heatmap_bgs.append(heatmap_bg.cpu())
     heatmap_fg = torch.cat(heatmap_fgs, dim=0)
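The two hunks above only add shape comments, but they document the core of segment_fg_bg: a foreground and a background reference activation are read from fixed patch positions, and per-patch similarity heatmaps are computed against them after L2-normalizing the features. A minimal standalone sketch of that pattern, with dummy tensors standing in for the model output (the 14x14 grid is taken from the new comment; everything else here is illustrative, not the app's exact code):

import torch
import torch.nn.functional as F

B, H, W, C = 2, 14, 14, 768            # assumed batch and channel sizes; grid matches the [B, H=14, W=14, C] comment
feats = torch.randn(B, H, W, C)        # stand-in for model(input_images)['attn'][6]

fg_act = feats[:, 6, 6].mean(0)        # reference activation from a center patch (likely foreground)
bg_act = feats[:, 0, 0].mean(0)        # reference activation from a corner patch (likely background)

feats = F.normalize(feats, dim=-1)     # normalize per-patch features, as in the second hunk
heatmap_fg = feats @ fg_act[:, None]   # [B, H, W, 1] similarity to the foreground reference
heatmap_bg = feats @ bg_act[:, None]   # [B, H, W, 1] similarity to the background reference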
@@ -498,8 +498,8 @@ def make_cluster_plot(eigvecs, images, h=64, w=64, progess_start=0.6, advanced=F
     left = F.normalize(left, dim=-1)
     right = F.normalize(right, dim=-1)
     heatmap = left @ right.T
-    heatmap = F.normalize(heatmap, dim=-1)
-    num_samples = clusters + 20
+    heatmap = F.normalize(heatmap, dim=-1)  # [300, N_pixel] PCA-> [300, 8]
+    num_samples = clusters + 20  # 100/120
     if num_samples > fps_idx.shape[0]:
         num_samples = fps_idx.shape[0]
     r2_fps_idx = farthest_point_sampling(heatmap, num_samples)
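The farthest_point_sampling helper used above is defined elsewhere in the app (this diff does not show it), so its exact signature is an assumption. As a rough illustration of what such a sampler does on the row-normalized heatmap, a naive greedy version could look like this sketch:

import torch

def naive_farthest_point_sampling(points: torch.Tensor, num_samples: int) -> torch.Tensor:
    # points: [N, D] feature rows; returns indices of rows that are mutually far apart
    num_samples = min(num_samples, points.shape[0])
    selected = [0]                                     # start from an arbitrary row
    dist = torch.cdist(points[:1], points)[0]          # distance of every row to the selected set
    for _ in range(num_samples - 1):
        idx = int(dist.argmax())                       # pick the row farthest from everything chosen so far
        selected.append(idx)
        dist = torch.minimum(dist, torch.cdist(points[idx:idx + 1], points)[0])
    return torch.tensor(selected)

# e.g. picking clusters + 20 well-spread rows of the [300, N_pixel] heatmap before plotting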
@@ -939,7 +939,7 @@ def ncut_run(
         return video_path, logging_str

     cluster_images = None
-    if plot_clusters:
+    if plot_clusters and kwargs.get("n_ret", 1) > 1:
         start = time.time()
         progress_start = 0.6
         progress(progress_start, desc="Plotting Clusters")
@@ -955,7 +955,7 @@ def ncut_run(
         logging_str += f"plot time: {time.time() - start:.2f}s\n"

     norm_images = None
-    if alignedcut_eig_norm_plot:
+    if alignedcut_eig_norm_plot and kwargs.get("n_ret", 1) > 1:
         norm_images = []
         # eig_magnitude = torch.clamp(eig_magnitude, 0, 1)
         vmin, vmax = eig_magnitude.min(), eig_magnitude.max()
@@ -977,7 +977,7 @@ def ncut_run(


 def _ncut_run(*args, **kwargs):
-    n_ret = kwargs.
+    n_ret = kwargs.get("n_ret", 1)
     try:
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
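The three hunks above make the extra cluster and eigenvector-norm plots conditional on an n_ret value carried in kwargs (and read it with .get so it stays available for both checks), which lets a caller that only wants the embedding skip the slow plotting; the new Application tab later wires partial(run_fn, n_ret=1). A toy sketch of that keyword-threading pattern, using hypothetical stand-in functions rather than the app's own:

from functools import partial

def _run(*args, **kwargs):
    # mirror of the app's pattern: read n_ret from kwargs, build extras only when more returns are wanted
    n_ret = kwargs.get("n_ret", 1)
    outputs = ["embedding_gallery"]
    if n_ret > 1:
        outputs.append("cluster_plot")   # analogous to the plot_clusters / eig_norm_plot branches
    return outputs[:n_ret]

run_single = partial(_run, n_ret=1)      # like partial(run_fn, n_ret=1) in the Application tab
assert run_single() == ["embedding_gallery"]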
@@ -1653,8 +1653,9 @@ def load_and_append(existing_images, *args, **kwargs):
     gr.Info(f"Total images: {len(existing_images)}")
     return existing_images

-def make_input_images_section(rows=1, cols=3, height="auto", advanced=False, is_random=False, allow_download=False):
-    gr.Markdown('### Input Images')
+def make_input_images_section(rows=1, cols=3, height="auto", advanced=False, is_random=False, allow_download=False, markdown=True):
+    if markdown:
+        gr.Markdown('### Input Images')
     input_gallery = gr.Gallery(value=None, label="Input images", show_label=True, elem_id="input_images", columns=[cols], rows=[rows], object_fit="contain", height=height, type="pil", show_share_button=False,
                                format="webp")

@@ -2020,10 +2021,12 @@ def add_download_button(gallery, filename_prefix="output"):
     return create_file_button, download_button


-def make_output_images_section():
-    gr.Markdown('### Output Images')
+def make_output_images_section(markdown=True, button=True):
+    if markdown:
+        gr.Markdown('### Output Images')
     output_gallery = gr.Gallery(format='png', value=[], label="NCUT Embedding", show_label=True, elem_id="ncut", columns=[3], rows=[1], object_fit="contain", height="auto", show_share_button=True, interactive=False)
-    add_rotate_flip_buttons(output_gallery)
+    if button:
+        add_rotate_flip_buttons(output_gallery)
     return output_gallery

 def make_parameters_section(is_lisa=False, model_ratio=True):
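Both section builders now take flags so a caller can opt out of the default heading and rotate/flip buttons, which is what the new Application tab does when it supplies its own "### Step ..." headings. A minimal sketch of the same optional-section pattern, with a stand-in helper rather than the app's full function:

import gradio as gr

def make_gallery_section(markdown=True, button=True):
    # stand-in for make_output_images_section: heading and extra controls are now optional
    if markdown:
        gr.Markdown('### Output Images')
    gallery = gr.Gallery(value=[], label="NCUT Embedding", columns=[3], interactive=False)
    if button:
        gr.Button("Rotate / Flip")  # placeholder for add_rotate_flip_buttons(gallery)
    return gallery

with gr.Blocks() as demo:
    with gr.Tab("AlignedCut"):
        make_gallery_section()                           # default layout, as before
    with gr.Tab("Application"):
        gr.Markdown("### Step 1: NCUT Embedding")        # the new tab writes its own heading
        make_gallery_section(markdown=False, button=False)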
@@ -2133,6 +2136,8 @@ demo = gr.Blocks(
     css=custom_css,
 )
 with demo:
+
+
     with gr.Tab('AlignedCut'):

         with gr.Row():
@@ -3081,7 +3086,304 @@ with demo:
         buttons[-1].click(fn=lambda x: gr.update(visible=True), outputs=rows[-1])
         buttons[-1].click(fn=lambda x: gr.update(visible=False), outputs=buttons[-1])

+    with gr.Tab('Application'):
+        gr.Markdown("Draw some points on the image to find corresponding segments in other images. E.g. click on one face to segment all the faces. [Video Tutorial (coming...)]()")
+        with gr.Row():
+            with gr.Column(scale=5, min_width=200):
+                gr.Markdown("### Step 0: Load Images")
+                input_gallery, submit_button, clear_images_button, dataset_dropdown, num_images_slider, random_seed_slider, load_images_button = make_input_images_section(markdown=False)
+                submit_button.visible = False
+                num_images_slider.value = 30
+                logging_text = gr.Textbox("Logging information", label="Logging", elem_id="logging", type="text", placeholder="Logging information", autofocus=False, autoscroll=False)
+            with gr.Column(scale=5, min_width=200):
+                gr.Markdown("### Step 1: NCUT Embedding")
+                output_gallery = make_output_images_section(markdown=False, button=False)
+                submit_button = gr.Button("🔴 RUN", elem_id="submit_button", variant='primary')
+                add_rotate_flip_buttons(output_gallery)
+                [
+                    model_dropdown, layer_slider, node_type_dropdown, num_eig_slider,
+                    affinity_focal_gamma_slider, num_sample_ncut_slider, ncut_knn_slider, ncut_indirect_connection, ncut_make_orthogonal,
+                    embedding_method_dropdown, embedding_metric_dropdown, num_sample_tsne_slider, knn_tsne_slider,
+                    perplexity_slider, n_neighbors_slider, min_dist_slider,
+                    sampling_method_dropdown, ncut_metric_dropdown, positive_prompt, negative_prompt
+                ] = make_parameters_section()
+
+                false_placeholder = gr.Checkbox(label="False", value=False, elem_id="false_placeholder", visible=False)
+                no_prompt = gr.Textbox("", label="", elem_id="empty_placeholder", type="text", placeholder="", visible=False)
+
+                submit_button.click(
+                    partial(run_fn, n_ret=1),
+                    inputs=[
+                        input_gallery, model_dropdown, layer_slider, num_eig_slider, node_type_dropdown,
+                        positive_prompt, negative_prompt,
+                        false_placeholder, no_prompt, no_prompt, no_prompt,
+                        affinity_focal_gamma_slider, num_sample_ncut_slider, ncut_knn_slider, ncut_indirect_connection, ncut_make_orthogonal,
+                        embedding_method_dropdown, embedding_metric_dropdown, num_sample_tsne_slider, knn_tsne_slider,
+                        perplexity_slider, n_neighbors_slider, min_dist_slider, sampling_method_dropdown, ncut_metric_dropdown
+                    ],
+                    outputs=[output_gallery, logging_text],
+                )
+
+            with gr.Column(scale=5, min_width=200):
+                gr.Markdown("### Step 2a: Pick an Image")
+                from gradio_image_prompter import ImagePrompter
+                image_type_radio = gr.Radio(["Original", "NCUT"], label="Image Display Type", value="Original", elem_id="image_type_radio")
+                with gr.Row():
+                    image1_slider = gr.Slider(0, 100, step=1, label="Image#1 Index", value=0, elem_id="image1_slider", interactive=True)
+                    image2_slider = gr.Slider(0, 100, step=1, label="Image#2 Index", value=1, elem_id="image2_slider", interactive=True)
+                    image3_slider = gr.Slider(0, 100, step=1, label="Image#3 Index", value=2, elem_id="image3_slider", interactive=True)
+                load_one_image_button = gr.Button("🔴 Load", elem_id="load_one_image_button", variant='primary')
+                gr.Markdown("### Step 2b: Draw Points")
+                gr.Markdown("##### 🖱️ Left Click: Foreground")
+                gr.Markdown("##### 🖱️ Middle Click: Background")
+                gr.Markdown("""
+                <h5>
+                Top Right
+                <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
+                    stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"
+                    style="vertical-align: middle; height: 1em; width: 1em; display: inline;">
+                    <polyline points="1 4 1 10 7 10"></polyline>
+                    <path d="M3.51 15a9 9 0 1 0 2.13-9.36L1 10"></path>
+                </svg> :
+                Remove Last Point
+                </h5>
+                """)
+                prompt_image1 = ImagePrompter(show_label=False, elem_id="prompt_image", interactive=True)
+                prompt_image2 = ImagePrompter(show_label=False, elem_id="prompt_image", interactive=True)
+                prompt_image3 = ImagePrompter(show_label=False, elem_id="prompt_image", interactive=True)
+                # def update_number_of_images(images):
+                #     if images is None:
+                #         return gr.update(max=0, value=0)
+                #     return gr.update(max=len(images)-1, value=1)
+                # input_gallery.change(update_number_of_images, inputs=input_gallery, outputs=image1_slider)
+
+                def update_prompt_image(original_images, ncut_images, image_type, index):
+                    if image_type == "Original":
+                        images = original_images
+                    else:
+                        images = ncut_images
+                    if images is None:
+                        return
+                    total_len = len(images)
+                    if total_len == 0:
+                        return
+                    if index >= total_len:
+                        index = total_len - 1
+                    return gr.update(value={'image': images[index][0]})
+                load_one_image_button.click(update_prompt_image, inputs=[input_gallery, output_gallery, image_type_radio, image1_slider], outputs=[prompt_image1])
+                load_one_image_button.click(update_prompt_image, inputs=[input_gallery, output_gallery, image_type_radio, image2_slider], outputs=[prompt_image2])
+                load_one_image_button.click(update_prompt_image, inputs=[input_gallery, output_gallery, image_type_radio, image3_slider], outputs=[prompt_image3])
+
+                image3_slider.visible = False
+                prompt_image3.visible = False
+
+
+
+            with gr.Column(scale=5, min_width=200):
+                gr.Markdown("### Step 3: Segment and Crop")
+                mask_gallery = gr.Gallery(value=[], label="Segmentation Masks", show_label=True, elem_id="mask_gallery", columns=[3], rows=[1], object_fit="contain", height="auto", show_share_button=True, interactive=False)
+                run_crop_button = gr.Button("🔴 RUN", elem_id="run_crop_button", variant='primary')
+                add_download_button(mask_gallery, "mask")
+                distance_threshold_slider = gr.Slider(0, 1, step=0.01, label="Mask Threshold", value=0.5, elem_id="distance_threshold", info="increase for smaller mask")
+                # filter_small_area_checkbox = gr.Checkbox(label="Noise Reduction", value=True, elem_id="filter_small_area_checkbox")
+                distance_power_slider = gr.Slider(-3, 3, step=0.01, label="Distance Power", value=0.5, elem_id="distance_power", info="d = d^p", visible=False)
+                crop_gallery = gr.Gallery(value=[], label="Cropped Images", show_label=True, elem_id="crop_gallery", columns=[3], rows=[1], object_fit="contain", height="auto", show_share_button=True, interactive=False)
+                add_download_button(crop_gallery, "cropped")
+                crop_expand_slider = gr.Slider(1.0, 2.0, step=0.1, label="Crop bbox Expand Factor", value=1.0, elem_id="crop_expand", info="increase for larger crop", visible=True)
+                area_threshold_slider = gr.Slider(0, 100, step=0.1, label="Area Threshold (%)", value=3, elem_id="area_threshold", info="for noise filtering (area of connected components)", visible=False)
+
+                # logging_image = gr.Image(value=None, label="Logging Image", elem_id="logging_image", interactive=False)
+
+                # prompt_image.change(lambda x: gr.update(value=x.get('image', None)), inputs=prompt_image, outputs=[logging_image])
+
+                def relative_xy(prompts):
+                    image = prompts['image']
+                    points = np.asarray(prompts['points'])
+                    if points.shape[0] == 0:
+                        return [], []
+                    is_point = points[:, 5] == 4.0
+                    points = points[is_point]
+                    is_positive = points[:, 2] == 1.0
+                    is_negative = points[:, 2] == 0.0
+                    xy = points[:, :2].tolist()
+                    if isinstance(image, str):
+                        image = Image.open(image)
+                        image = np.array(image)
+                    h, w = image.shape[:2]
+                    new_xy = [(x/w, y/h) for x, y in xy]
+                    # print(new_xy)
+                    return new_xy, is_positive
+
+                def xy_rgb(prompts, image_idx, ncut_images):
+                    image = ncut_images[image_idx]
+                    xy, is_positive = relative_xy(prompts)
+                    rgbs = []
+                    for i, (x, y) in enumerate(xy):
+                        rgb = image.getpixel((int(x*image.width), int(y*image.height)))
+                        rgbs.append((rgb, is_positive[i]))
+                    return rgbs
+
+                def run_crop(original_images, ncut_images, prompts1, prompts2, prompts3, image_idx1, image_idx2, image_idx3,
+                             crop_expand, distance_threshold, distance_power, area_threshold):
+                    ncut_images = [image[0] for image in ncut_images]
+                    if len(ncut_images) == 0:
+                        return []
+                    if isinstance(ncut_images[0], str):
+                        ncut_images = [Image.open(image) for image in ncut_images]
+
+                    rgbs = xy_rgb(prompts1, image_idx1, ncut_images) + \
+                           xy_rgb(prompts2, image_idx2, ncut_images) + \
+                           xy_rgb(prompts3, image_idx3, ncut_images)
+                    # print(rgbs)
+
+
+                    ncut_images = [np.array(image).astype(np.float32) for image in ncut_images]
+                    ncut_pixels = [image.reshape(-1, 3) for image in ncut_images]
+                    h, w = ncut_images[0].shape[:2]
+                    ncut_pixels = torch.tensor(np.array(ncut_pixels).reshape(-1, 3)) / 255
+                    # normalized_ncut_pixels = F.normalize(ncut_pixels, p=2, dim=-1)
+                    positive_distances = []
+                    negative_distances = []
+                    for rgb, is_positive in rgbs:
+                        rgb = torch.tensor(rgb).float() / 255
+                        # rgb = F.normalize(rgb, p=2, dim=-1)
+                        distance = (ncut_pixels - rgb[None]).norm(dim=-1)
+                        distance = distance.squeeze(-1)
+                        if is_positive:
+                            positive_distances.append(distance)
+                        else:
+                            negative_distances.append(distance)
+                    if len(positive_distances) == 0:
+                        raise gr.Error("No prompt points. Please draw some points on the image.")
+                    positive_distances = torch.stack(positive_distances)
+                    negative_flag = len(negative_distances) > 0
+                    if len(negative_distances) == 0:
+                        negative_distances = positive_distances * 0  # dummy
+                    else:
+                        negative_distances = torch.stack(negative_distances)
+
+                    positive_distance = positive_distances.min(dim=0).values
+                    negative_distance = negative_distances.min(dim=0).values
+                    # positive_distance = positive_distances.mean(dim=0)
+                    # negative_distance = negative_distances.mean(dim=0)
+
+                    def to_mask(heatmap, threshold):
+                        heatmap = 1 / (heatmap + 1e-6)
+                        heatmap = heatmap.reshape(len(ncut_images), h, w)
+                        vmin, vmax = heatmap.quantile(0.01), heatmap.quantile(0.99)
+                        heatmap = (heatmap - vmin) / (vmax - vmin)
+                        mask = heatmap > threshold
+                        return mask
+
+                    positive_mask = to_mask(positive_distance, distance_threshold)
+                    if negative_flag:
+                        negative_mask = to_mask(negative_distance, distance_threshold)
+                        positive_mask = positive_mask & ~negative_mask
+
+
+                    # convert to PIL
+                    mask = positive_mask.cpu().numpy()
+                    mask = mask.astype(np.uint8) * 255
+                    mask = [Image.fromarray(mask[i]) for i in range(len(mask))]
+
+                    import cv2
+                    def get_bboxes_and_clean_mask(pil_mask, min_area=500):
+                        """
+                        Args:
+                        - pil_mask: A Pillow image of a binary mask with 255 for the object and 0 for the background.
+                        - min_area: Minimum area for a connected component to be considered valid (default 500).
+
+                        Returns:
+                        - bounding_boxes: List of bounding boxes for valid objects (x, y, width, height).
+                        - cleaned_pil_mask: A Pillow image of the cleaned mask, with small components removed.
+                        """
+                        # Convert the Pillow image to a NumPy array
+                        mask = np.array(pil_mask)
+
+                        # Ensure the mask is binary (0 or 255)
+                        mask = np.where(mask > 127, 255, 0).astype(np.uint8)
+
+                        # Remove small noise using morphological operations (denoising)
+                        kernel = np.ones((5, 5), np.uint8)
+                        cleaned_mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
+
+                        # Find connected components in the cleaned mask
+                        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(cleaned_mask, connectivity=8)
+
+                        # Initialize an empty mask to store the final cleaned mask
+                        final_cleaned_mask = np.zeros_like(cleaned_mask)
+
+                        # Collect bounding boxes for components that are larger than the threshold and update the cleaned mask
+                        bounding_boxes = []
+                        for i in range(1, num_labels):  # Skip label 0 (background)
+                            x, y, w, h, area = stats[i]
+                            if area >= min_area:
+                                # Add the bounding box of the valid component
+                                bounding_boxes.append((x, y, w, h))
+                                # Keep the valid components in the final cleaned mask
+                                final_cleaned_mask[labels == i] = 255
+
+                        # Convert the final cleaned mask back to a Pillow image
+                        cleaned_pil_mask = Image.fromarray(final_cleaned_mask)
+
+                        return bounding_boxes, cleaned_pil_mask
+
+                    bboxs, filtered_masks = zip(*[get_bboxes_and_clean_mask(_mask) for _mask in mask])
+
+                    # combine the masks, also draw the bounding boxes
+                    combined_masks = []
+                    for i_image in range(len(mask)):
+                        noisy_mask = np.array(mask[i_image].convert("RGB"))
+                        bbox = bboxs[i_image]
+                        clean_mask = np.array(filtered_masks[i_image].convert("RGB"))
+                        combined_mask = noisy_mask * 0.4 + clean_mask
+                        combined_mask = np.clip(combined_mask, 0, 255).astype(np.uint8)
+                        for x, y, w, h in bbox:
+                            cv2.rectangle(combined_mask, (x-1, y-1), (x + w+2, y + h+2), (255, 0, 0), 2)
+                        combined_mask = Image.fromarray(combined_mask)
+                        combined_masks.append(combined_mask)
+
+                    def extend_the_mask(xywh, factor=1.5):
+                        x, y, w, h = xywh
+                        x -= w * (factor - 1) / 2
+                        y -= h * (factor - 1) / 2
+                        w *= factor
+                        h *= factor
+                        return x, y, w, h
+
+                    def resize_the_mask(xywh, original_size, target_size):
+                        x, y, w, h = xywh
+                        x *= target_size[0] / original_size[0]
+                        y *= target_size[1] / original_size[1]
+                        w *= target_size[0] / original_size[0]
+                        h *= target_size[1] / original_size[1]
+                        x, y, w, h = int(x), int(y), int(w), int(h)
+                        return x, y, w, h
+
+                    def crop_image(image, xywh, mask_h, mask_w, factor=1.0):
+                        x, y, w, h = xywh
+                        x, y, w, h = resize_the_mask((x, y, w, h), (mask_h, mask_w), image.size)
+                        _x, _y, _w, _h = extend_the_mask((x, y, w, h), factor=factor)
+                        crop = image.crop((_x, _y, _x + _w, _y + _h))
+                        return crop
+
+                    original_images = [image[0] for image in original_images]
+                    if isinstance(original_images[0], str):
+                        original_images = [Image.open(image) for image in original_images]
+
+                    mask_h, mask_w = filtered_masks[0].size
+                    cropped_images = []
+                    for _image, _bboxs in zip(original_images, bboxs):
+                        for _bbox in _bboxs:
+                            cropped_images.append(crop_image(_image, _bbox, mask_h, mask_w, factor=crop_expand))
+
+                    return combined_masks, cropped_images
+
+                run_crop_button.click(run_crop,
+                                      inputs=[input_gallery, output_gallery, prompt_image1, prompt_image2, prompt_image3, image1_slider, image2_slider, image3_slider,
+                                              crop_expand_slider, distance_threshold_slider, distance_power_slider, area_threshold_slider],
+                                      outputs=[mask_gallery, crop_gallery])
+

     with gr.Tab('📄About'):
         with gr.Column():