import warnings

import torch
from transformers import (
    SegformerImageProcessor,
    SegformerForSemanticSegmentation,
    DPTImageProcessor,
    DPTForDepthEstimation,
)
from PIL import Image, ImageFilter
import numpy as np
import gradio as gr

# Suppress noisy transformers UserWarnings
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Load pre-trained models and processors
seg_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
seg_model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")


def process_image(image):
    # Ensure the image is in RGB
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize the image to 512x512
    image = image.resize((512, 512))

    # ------------------ Semantic Segmentation ------------------
    seg_inputs = seg_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        seg_outputs = seg_model(**seg_inputs)
    seg_logits = seg_outputs.logits

    # SegFormer emits logits at 1/4 of the input resolution, so upsample
    # them back to 512x512 before taking the per-pixel argmax; otherwise
    # the mask would be 128x128 and misaligned with the input image.
    seg_logits = torch.nn.functional.interpolate(
        seg_logits, size=image.size[::-1], mode="bilinear", align_corners=False
    )
    segmentation = torch.argmax(seg_logits, dim=1)[0].numpy()

    # Create a binary mask for the 'person' class (ADE20K class index 12)
    person_class_index = 12
    binary_mask = (segmentation == person_class_index).astype(np.uint8) * 255
    binary_mask_image = Image.fromarray(binary_mask)

    # ------------------ Depth Estimation ------------------
    depth_inputs = depth_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        depth_outputs = depth_model(**depth_inputs)
    predicted_depth = depth_outputs.predicted_depth[0].cpu().numpy()

    # Normalize the depth map to [0, 1] for visualization; the epsilon
    # guards against division by zero on constant-depth inputs
    min_depth = predicted_depth.min()
    max_depth = predicted_depth.max()
    normalized_depth = (predicted_depth - min_depth) / (max_depth - min_depth + 1e-8)
    depth_map_image = Image.fromarray((normalized_depth * 255).astype(np.uint8))

    # ------------------ Blurred Background Effect ------------------
    # Invert the depth map: DPT predicts larger values for nearer surfaces,
    # so after inversion far-away (background) pixels carry the most blur
    inverted_depth = 1 - normalized_depth
    inverted_depth = (inverted_depth - inverted_depth.min()) / (
        inverted_depth.max() - inverted_depth.min() + 1e-8
    )

    # Resize to 512x512 and add a trailing channel axis so the weight map
    # broadcasts across the RGB channels
    depth_weight_resized = Image.fromarray((inverted_depth * 255).astype(np.uint8)).resize((512, 512))
    depth_weight_resized = np.array(depth_weight_resized) / 255.0
    depth_weight_resized = np.expand_dims(depth_weight_resized, axis=-1)

    # Apply Gaussian blur to the entire image
    blurred_image = image.filter(ImageFilter.GaussianBlur(radius=15))

    # Convert images to float arrays for blending
    original_np = np.array(image).astype(np.float32)
    blurred_np = np.array(blurred_image).astype(np.float32)

    # Blend per pixel: sharp where the weight is low (near), blurred where it is high (far)
    composite_np = (1 - depth_weight_resized) * original_np + depth_weight_resized * blurred_np
    composite_image = Image.fromarray(np.clip(composite_np, 0, 255).astype(np.uint8))

    return image, binary_mask_image, depth_map_image, composite_image


# Define the Gradio interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Image(type="pil", label="Original Image"),
        gr.Image(type="pil", label="Segmentation Mask"),
        gr.Image(type="pil", label="Depth Map"),
        gr.Image(type="pil", label="Blurred Background Effect"),
    ],
    title="Semantic Segmentation and Depth Estimation",
    description="Upload an image to generate a segmentation mask, depth map, and a blurred background effect.",
    examples=[
        ["Selfie_1.jpg"],
        ["Selfie_2.jpg"],
    ],
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
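
# Usage sketch (an assumption, not part of the original script): if this file is
# saved as app.py and the example images Selfie_1.jpg / Selfie_2.jpg from the
# `examples` list sit next to it, the demo starts with:
#
#   python app.py
#
# process_image can also be called directly, bypassing the Gradio UI:
#
#   from PIL import Image
#   original, mask, depth, composite = process_image(Image.open("Selfie_1.jpg"))
#   composite.save("portrait_blur.png")  # hypothetical output filename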