---
library_name: keras-hub
pipeline_tag: text-to-image
tags:
- image-to-image
- keras
---
### Model Overview
# Stable Diffusion 3 Medium

![demo](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3demo.jpg)

## Model

![mmdit](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/mmdit.png)

[Stable Diffusion 3 Medium](https://stability.ai/news/stable-diffusion-3-medium) is a Multimodal Diffusion Transformer (MMDiT) text-to-image model with greatly improved performance in image quality, typography, complex prompt understanding, and resource efficiency.

For more technical details, please refer to the [research paper](https://stability.ai/news/stable-diffusion-3-research-paper).

Please note: this model is released under the Stability Community License. For an Enterprise License, visit Stability.ai or [contact us](https://stability.ai/enterprise) for commercial licensing details.

### Model Description

- **Developed by:** Stability AI
- **Model type:** MMDiT text-to-image generative model
- **Model Description:** This model generates images from text prompts. It is a [Multimodal Diffusion Transformer](https://arxiv.org/abs/2403.03206) that uses three fixed, pretrained text encoders ([OpenCLIP-ViT/G](https://github.com/mlfoundations/open_clip), [CLIP-ViT/L](https://github.com/openai/CLIP/tree/main), and [T5-XXL](https://huggingface.co/google/t5-v1_1-xxl)).

### Model card

https://huggingface.co/stabilityai/stable-diffusion-3-medium

## Example Usage
```python
import keras_hub
import numpy as np

# Pretrained Stable Diffusion 3 model.
model = keras_hub.models.StableDiffusion3Backbone.from_preset(
    "stable_diffusion_3_medium"
)

# Randomly initialized Stable Diffusion 3 model with custom config.
vae = keras_hub.models.VAEBackbone(...)
clip_l = keras_hub.models.CLIPTextEncoder(...)
clip_g = keras_hub.models.CLIPTextEncoder(...)
model = keras_hub.models.StableDiffusion3Backbone(
    mmdit_patch_size=2,
    mmdit_num_heads=4,
    mmdit_hidden_dim=256,
    mmdit_depth=4,
    mmdit_position_size=192,
    vae=vae,
    clip_l=clip_l,
    clip_g=clip_g,
)

# Image-to-image example.
image_to_image = keras_hub.models.StableDiffusion3ImageToImage.from_preset(
    "stable_diffusion_3_medium", height=512, width=512
)
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    }
)

# Generate with batched prompts.
image_to_image.generate(
    {
        "images": np.ones((2, 512, 512, 3), dtype="float32"),
        "prompts": ["cute wallpaper art of a cat", "cute wallpaper art of a dog"],
    }
)

# Generate with different `num_steps`, `guidance_scale` and `strength`.
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    },
    num_steps=50,
    guidance_scale=5.0,
    strength=0.6,
)

# Generate with `negative_prompts`.
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "negative_prompts": "green color",
    }
)

# Inpainting example.
reference_image = np.ones((1024, 1024, 3), dtype="float32")
reference_mask = np.ones((1024, 1024), dtype="float32")
inpaint = keras_hub.models.StableDiffusion3Inpaint.from_preset(
    "stable_diffusion_3_medium", height=512, width=512
)
inpaint.generate(
    reference_image,
    reference_mask,
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
)

# Generate with batched prompts.
reference_images = np.ones((2, 512, 512, 3), dtype="float32")
reference_mask = np.ones((2, 512, 512), dtype="float32")
inpaint.generate(
    reference_images,
    reference_mask,
    ["cute wallpaper art of a cat", "cute wallpaper art of a dog"],
)

# Generate with different `num_steps`, `guidance_scale` and `strength`.
inpaint.generate(
    reference_image,
    reference_mask,
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    num_steps=50,
    guidance_scale=5.0,
    strength=0.6,
)

# Text-to-image example.
text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
    "stable_diffusion_3_medium", height=512, width=512
)
text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
)

# Generate with batched prompts.
text_to_image.generate(
    ["cute wallpaper art of a cat", "cute wallpaper art of a dog"]
)

# Generate with different `num_steps` and `guidance_scale`.
text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    num_steps=50,
    guidance_scale=5.0,
)

# Generate with `negative_prompts`.
text_to_image.generate(
    {
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "negative_prompts": "green color",
    }
)
```

## Example Usage with Hugging Face URI

```python
import keras_hub
import numpy as np

# Pretrained Stable Diffusion 3 model.
model = keras_hub.models.StableDiffusion3Backbone.from_preset(
    "hf://keras/stable_diffusion_3_medium"
)

# Randomly initialized Stable Diffusion 3 model with custom config.
vae = keras_hub.models.VAEBackbone(...)
clip_l = keras_hub.models.CLIPTextEncoder(...)
clip_g = keras_hub.models.CLIPTextEncoder(...)
model = keras_hub.models.StableDiffusion3Backbone(
    mmdit_patch_size=2,
    mmdit_num_heads=4,
    mmdit_hidden_dim=256,
    mmdit_depth=4,
    mmdit_position_size=192,
    vae=vae,
    clip_l=clip_l,
    clip_g=clip_g,
)

# Image-to-image example.
image_to_image = keras_hub.models.StableDiffusion3ImageToImage.from_preset(
    "hf://keras/stable_diffusion_3_medium", height=512, width=512
)
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    }
)

# Generate with batched prompts.
image_to_image.generate(
    {
        "images": np.ones((2, 512, 512, 3), dtype="float32"),
        "prompts": ["cute wallpaper art of a cat", "cute wallpaper art of a dog"],
    }
)

# Generate with different `num_steps`, `guidance_scale` and `strength`.
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    },
    num_steps=50,
    guidance_scale=5.0,
    strength=0.6,
)

# Generate with `negative_prompts`.
image_to_image.generate(
    {
        "images": np.ones((512, 512, 3), dtype="float32"),
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "negative_prompts": "green color",
    }
)

# Inpainting example.
reference_image = np.ones((1024, 1024, 3), dtype="float32")
reference_mask = np.ones((1024, 1024), dtype="float32")
inpaint = keras_hub.models.StableDiffusion3Inpaint.from_preset(
    "hf://keras/stable_diffusion_3_medium", height=512, width=512
)
inpaint.generate(
    reference_image,
    reference_mask,
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
)

# Generate with batched prompts.
reference_images = np.ones((2, 512, 512, 3), dtype="float32")
reference_mask = np.ones((2, 512, 512), dtype="float32")
inpaint.generate(
    reference_images,
    reference_mask,
    ["cute wallpaper art of a cat", "cute wallpaper art of a dog"],
)

# Generate with different `num_steps`, `guidance_scale` and `strength`.
inpaint.generate(
    reference_image,
    reference_mask,
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    num_steps=50,
    guidance_scale=5.0,
    strength=0.6,
)

# Text-to-image example.
text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
    "hf://keras/stable_diffusion_3_medium", height=512, width=512
)
text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
)

# Generate with batched prompts.
text_to_image.generate(
    ["cute wallpaper art of a cat", "cute wallpaper art of a dog"]
)

# Generate with different `num_steps` and `guidance_scale`.
text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    num_steps=50,
    guidance_scale=5.0,
)

# Generate with `negative_prompts`.
text_to_image.generate(
    {
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "negative_prompts": "green color",
    }
)
```
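The `generate()` calls above return the synthesized images as arrays rather than writing files. Below is a minimal sketch of saving one text-to-image result to disk with `keras.utils.save_img`; it assumes the output for a single prompt is a single `(height, width, 3)` image array, so verify the dtype and value range for your keras-hub version before relying on it.

```python
import keras
import keras_hub

# Load the text-to-image task from the pretrained preset.
text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
    "stable_diffusion_3_medium", height=512, width=512
)

# Generate a single image; assumed to be an array of shape (512, 512, 3).
generated_image = text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
)

# `keras.utils.save_img` accepts uint8 or float arrays and writes the image
# to disk (format inferred from the file extension).
keras.utils.save_img("astronaut.png", generated_image)
```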