Spaces:

Ryukijano
/

Fastest-image-generation

Running on Zero

App Files Files Community

Ryukijano committed 17 days ago

Commit

9565796

•

1 Parent(s): 3567671

Upload 3 files

Browse files

Files changed (3) hide show

app.py +4 -21
custom_pipeline.py +0 -192
requirements.txt +9 -9

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ import time
 from diffusers import DiffusionPipeline, AutoencoderTiny
 from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
-from huggingface_hub import login
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -19,33 +18,17 @@ DEFAULT_HEIGHT = 1024
 DEFAULT_INFERENCE_STEPS = 1
 # Device and model setup
-dtype = torch.bfloat16
 pipe = FluxWithCFGPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype, use_safetensors=True
 )
-pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype, use_safetensors=True, variant="fp16")
 pipe.to("cuda")
 pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-realism_v2.3.safetensors', adapter_name="better")
 pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
-# Enable xformers
-pipe.enable_xformers_memory_efficient_attention()
-# Compile the model (Optional, needs further testing for stability)
-# pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
-# Capture CUDA Graph (Warm-up)
-static_inputs = {
-    "prompt": "warmup",
-    "width": DEFAULT_WIDTH,
-    "height": DEFAULT_HEIGHT,
-    "num_inference_steps": DEFAULT_INFERENCE_STEPS,
-    "generator": torch.Generator().manual_seed(0),
-}
-pipe.capture_cuda_graph(**static_inputs)
 torch.cuda.empty_cache()
 # Inference function
@@ -180,4 +163,4 @@ with gr.Blocks() as demo:
         )
 # Launch the app
-demo.launch()

 from diffusers import DiffusionPipeline, AutoencoderTiny
 from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
 torch.backends.cuda.matmul.allow_tf32 = True
 DEFAULT_INFERENCE_STEPS = 1
 # Device and model setup
+dtype = torch.float16
 pipe = FluxWithCFGPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype
 )
+pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype)
 pipe.to("cuda")
 pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-realism_v2.3.safetensors', adapter_name="better")
 pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
 torch.cuda.empty_cache()
 # Inference function
         )
 # Launch the app
+demo.launch()

custom_pipeline.py CHANGED Viewed

@@ -3,7 +3,6 @@ import numpy as np
 from diffusers import FluxPipeline, FlowMatchEulerDiscreteScheduler
 from typing import Any, Dict, List, Optional, Union
 from PIL import Image
-from collections import OrderedDict
 # Constants for shift calculation
 BASE_SEQ_LEN = 256
@@ -48,169 +47,6 @@ class FluxWithCFGPipeline(FluxPipeline):
     Extends the FluxPipeline to yield intermediate images during the denoising process
     with progressively increasing resolution for faster generation.
     """
-    def __init__(
-        self,
-        vae,
-        text_encoder,
-        text_encoder_2,
-        tokenizer,
-        tokenizer_2,
-        transformer,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-    ):
-        super().__init__(vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, transformer, scheduler)
-        self.cuda_graphs = {}
-    def capture_cuda_graph(
-        self,
-        prompt: Union[str, List[str]] = None,
-        prompt_2: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 4,
-        guidance_scale: float = 3.5,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-        max_sequence_length: int = 300,
-        **kwargs,
-    ):
-        """
-        Captures a static CUDA Graph for the generation process given static inputs.
-        """
-        # Use a static size for all inputs
-        static_height = height
-        static_width = width
-        # 1. Check inputs
-        self.check_inputs(
-            prompt,
-            prompt_2,
-            static_height,
-            static_width,
-            prompt_embeds=prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            max_sequence_length=max_sequence_length,
-        )
-        self._guidance_scale = guidance_scale
-        self._joint_attention_kwargs = joint_attention_kwargs
-        self._interrupt = False
-        # 2. Define call parameters
-        batch_size = 1
-        device = self._execution_device
-        # 3. Encode prompt (with static inputs)
-        lora_scale = joint_attention_kwargs.get("scale", None) if joint_attention_kwargs is not None else None
-        # Use a static prompt for capture
-        static_prompt = "static prompt" if isinstance(prompt, str) else ["static prompt"]
-        prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
-            prompt=static_prompt,
-            prompt_2=prompt_2,
-            prompt_embeds=None,
-            pooled_prompt_embeds=None,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            max_sequence_length=max_sequence_length,
-            lora_scale=lora_scale,
-        )
-        # 4. Prepare latent variables (with static inputs)
-        num_channels_latents = self.transformer.config.in_channels // 4
-        latents, latent_image_ids = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            static_height,
-            static_width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            None,
-        )
-        # 5. Prepare timesteps (with static inputs)
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = latents.shape[1]
-        mu = calculate_timestep_shift(image_seq_len)
-        timesteps, num_inference_steps = prepare_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            None,
-            sigmas,
-            mu=mu,
-        )
-        self._num_timesteps = len(timesteps)
-        guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float16).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
-        # Capture the graph
-        torch.cuda.synchronize()
-        stream = torch.cuda.Stream()
-        stream.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(stream):
-            for i, t in enumerate(timesteps):
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-                noise_pred = self.transformer(
-                    hidden_states=latents,
-                    timestep=timestep / 1000,
-                    guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds,
-                    encoder_hidden_states=prompt_embeds,
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
-                    joint_attention_kwargs=self.joint_attention_kwargs,
-                    return_dict=False,
-                )[0]
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-        torch.cuda.current_stream().wait_stream(stream)
-        torch.cuda.synchronize()
-        # Capture the CUDA graph
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph, stream=stream):
-            # Create static inputs
-            static_inputs = OrderedDict()
-            static_inputs["hidden_states"] = latents.clone()
-            static_inputs["timestep"] = timesteps[0].expand(latents.shape[0]).to(latents.dtype)
-            static_inputs["guidance"] = guidance.clone() if guidance is not None else None
-            static_inputs["pooled_projections"] = pooled_prompt_embeds.clone()
-            static_inputs["encoder_hidden_states"] = prompt_embeds.clone()
-            static_inputs["txt_ids"] = text_ids
-            static_inputs["img_ids"] = latent_image_ids.clone()
-            static_inputs["joint_attention_kwargs"] = self.joint_attention_kwargs
-            # Run the static graph
-            for i, t in enumerate(timesteps):
-                timestep = static_inputs["timestep"].clone()
-                noise_pred = self.transformer(
-                    hidden_states=static_inputs["hidden_states"],
-                    timestep=timestep / 1000,
-                    guidance=static_inputs["guidance"],
-                    pooled_projections=static_inputs["pooled_projections"],
-                    encoder_hidden_states=static_inputs["encoder_hidden_states"],
-                    txt_ids=static_inputs["txt_ids"],
-                    img_ids=static_inputs["img_ids"],
-                    joint_attention_kwargs=static_inputs["joint_attention_kwargs"],
-                    return_dict=False,
-                )[0]
-                static_inputs["hidden_states"] = self.scheduler.step(noise_pred, t, static_inputs["hidden_states"], return_dict=False)[0]
-            # Decode the latents after the loop
-            final_latents = static_inputs["hidden_states"]
-            final_image = self._decode_latents_to_image(final_latents, static_height, static_width, output_type)
-        # Store the graph and static inputs in the dictionary
-        self.cuda_graphs[(static_height, static_width, num_inference_steps)] = (graph, static_inputs, final_image)
     @torch.inference_mode()
     def generate_images(
         self,
@@ -235,34 +71,6 @@ class FluxWithCFGPipeline(FluxPipeline):
         height = height or self.default_sample_size * self.vae_scale_factor
         width = width or self.default_sample_size * self.vae_scale_factor
-        # 0. Check if a CUDA graph can be used
-        if (height, width, num_inference_steps) in self.cuda_graphs:
-            graph, static_inputs, final_image = self.cuda_graphs[(height, width, num_inference_steps)]
-            # Update dynamic inputs (like prompt) in static_inputs
-            lora_scale = joint_attention_kwargs.get("scale", None) if joint_attention_kwargs is not None else None
-            prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
-                prompt=prompt,
-                prompt_2=prompt_2,
-                prompt_embeds=prompt_embeds,
-                pooled_prompt_embeds=pooled_prompt_embeds,
-                device=self._execution_device,
-                num_images_per_prompt=num_images_per_prompt,
-                max_sequence_length=max_sequence_length,
-                lora_scale=lora_scale,
-            )
-            # Update only the dynamic parts of static_inputs
-            static_inputs["pooled_projections"].copy_(pooled_prompt_embeds)
-            static_inputs["encoder_hidden_states"].copy_(prompt_embeds)
-            static_inputs["txt_ids"] = text_ids
-            # Replay the graph
-            graph.replay()
-            torch.cuda.empty_cache()
-            return final_image
         # 1. Check inputs
         self.check_inputs(
             prompt,

 from diffusers import FluxPipeline, FlowMatchEulerDiscreteScheduler
 from typing import Any, Dict, List, Optional, Union
 from PIL import Image
 # Constants for shift calculation
 BASE_SEQ_LEN = 256
     Extends the FluxPipeline to yield intermediate images during the denoising process
     with progressively increasing resolution for faster generation.
     """
     @torch.inference_mode()
     def generate_images(
         self,
         height = height or self.default_sample_size * self.vae_scale_factor
         width = width or self.default_sample_size * self.vae_scale_factor
         # 1. Check inputs
         self.check_inputs(
             prompt,

requirements.txt CHANGED Viewed

@@ -1,10 +1,10 @@
-accelerate
-git+https://github.com/huggingface/diffusers.git@main
-torch>=2.0
-gradio==5.8.0
-transformers
-xformers
-sentencepiece
-peft
-numpy
 pillow

+accelerate
+git+https://github.com/huggingface/diffusers.git@main
+torch>=2.0
+gradio==5.8.0
+transformers
+xformers
+sentencepiece
+peft
+numpy
 pillow