bghira committed
Commit 563f3d0
1 Parent(s): 8be7ccb

Update custom_pipeline.py

Files changed (1):
  1. custom_pipeline.py +78 -72
custom_pipeline.py CHANGED
@@ -56,7 +56,6 @@ EXAMPLE_DOC_STRING = """
         ```py
         >>> import torch
         >>> from diffusers import FluxPipeline
-
         >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
         >>> prompt = "A cat holding a sign that says hello world"
@@ -93,7 +92,6 @@ def retrieve_timesteps(
     """
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
     Args:
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
@@ -108,7 +106,6 @@ def retrieve_timesteps(
         sigmas (`List[float]`, *optional*):
             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.
-
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
@@ -150,9 +147,7 @@ def retrieve_timesteps(
 class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     r"""
     The Flux pipeline for text-to-image generation.
-
     Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
-
     Args:
         transformer ([`FluxTransformer2DModel`]):
             Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
@@ -334,7 +329,6 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         lora_scale: Optional[float] = None,
     ):
         r"""
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
@@ -612,7 +606,6 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     ):
         r"""
         Function invoked when calling the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
@@ -674,9 +667,7 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
-
         Examples:
-
         Returns:
             [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
                 is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
@@ -797,102 +788,118 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         latent_image_ids = latent_image_ids.to(self.transformer.device)[0]
         timesteps = timesteps.to(self.transformer.device)
         text_ids = text_ids.to(self.transformer.device)[0]
+        negative_text_ids = negative_text_ids.to(self.transformer.device)[0]
+
+        # Assume 'do_batch_cfg' is a boolean indicating whether to use batched CFG
+        do_batch_cfg = True  # Set this to False to use sequential CFG
 
         # 6. Denoising loop
+
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-
-                # handle guidance
+                # Prepare the latent model input
+                prompt_embeds_input = prompt_embeds
+                pooled_prompt_embeds_input = pooled_prompt_embeds
+                text_ids_input = text_ids
+                latent_image_ids_input = latent_image_ids
+                prompt_mask_input = prompt_mask
+                latent_model_input = latents
+
+                if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
+                    # Concatenate prompt embeddings
+                    prompt_embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+                    pooled_prompt_embeds_input = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+                    # # Concatenate text IDs if they are used
+                    # if text_ids is not None and negative_text_ids is not None:
+                    #     text_ids_input = torch.cat([negative_text_ids, text_ids], dim=0)
+
+                    # Concatenate latent image IDs if they are used
+                    # if latent_image_ids is not None:
+                    #     latent_image_ids_input = torch.cat([latent_image_ids, latent_image_ids], dim=0)
+
+                    # Concatenate prompt masks if they are used
+                    if prompt_mask is not None and negative_mask is not None:
+                        prompt_mask_input = torch.cat([negative_mask, prompt_mask], dim=0)
+                    # Duplicate latents for unconditional and conditional inputs
+                    latent_model_input = torch.cat([latents] * 2)
+
+                # Expand timestep to match batch size
+                timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
+
+                # Handle guidance
                 if self.transformer.config.guidance_embeds:
-                    guidance = torch.tensor(
-                        [guidance_scale], device=self.transformer.device
-                    )
-                    guidance = guidance.expand(latents.shape[0])
+                    guidance = torch.tensor([guidance_scale], device=self.transformer.device)
+                    guidance = guidance.expand(latent_model_input.shape[0])
                 else:
                     guidance = None
 
+                # Prepare extra transformer arguments
                 extra_transformer_args = {}
                 if prompt_mask is not None:
-                    extra_transformer_args["attention_mask"] = prompt_mask.to(
-                        device=self.transformer.device
-                    )
+                    extra_transformer_args["attention_mask"] = prompt_mask_input.to(device=self.transformer.device)
 
+                # Forward pass through the transformer
                 noise_pred = self.transformer(
-                    hidden_states=latents.to(
-                        device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                    ),
-                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
+                    hidden_states=latent_model_input.to(device=self.transformer.device),
                     timestep=timestep / 1000,
                     guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds.to(
-                        device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                    ),
-                    encoder_hidden_states=prompt_embeds.to(
-                        device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                    ),
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
+                    pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
+                    encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
+                    txt_ids=text_ids_input.to(device=self.transformer.device) if text_ids is not None else None,
+                    img_ids=latent_image_ids_input.to(device=self.transformer.device) if latent_image_ids is not None else None,
                     joint_attention_kwargs=self.joint_attention_kwargs,
                     return_dict=False,
                     **extra_transformer_args,
                 )[0]
 
-                # TODO optionally use batch prediction to speed this up.
+                # Apply real CFG
                 if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
-                    noise_pred_uncond = self.transformer(
-                        hidden_states=latents.to(
-                            device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                        ),
-                        # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
-                        timestep=timestep / 1000,
-                        guidance=guidance,
-                        pooled_projections=negative_pooled_prompt_embeds.to(
-                            device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                        ),
-                        encoder_hidden_states=negative_prompt_embeds.to(
-                            device=self.transformer.device  # , dtype=self.transformer.dtype  # can't cast dtype like this because of NF4
-                        ),
-                        txt_ids=negative_text_ids.to(device=self.transformer.device),
-                        img_ids=latent_image_ids.to(device=self.transformer.device),
-                        joint_attention_kwargs=self.joint_attention_kwargs,
-                        return_dict=False,
-                    )[0]
-
-                    noise_pred = noise_pred_uncond + guidance_scale_real * (
-                        noise_pred - noise_pred_uncond
-                    )
-
-                # compute the previous noisy sample x_t -> x_t-1
+                    if do_batch_cfg:
+                        # Batched CFG: split the prediction into unconditional and conditional parts
+                        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        # Sequential CFG: compute the unconditional noise prediction separately
+                        noise_pred_uncond = self.transformer(
+                            hidden_states=latents.to(device=self.transformer.device),
+                            timestep=timestep / 1000,
+                            guidance=guidance,
+                            pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
+                            encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
+                            txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
+                            img_ids=latent_image_ids.to(device=self.transformer.device) if latent_image_ids is not None else None,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                        )[0]
+
+                        # Combine conditional and unconditional predictions
+                        noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)
+
+                # Compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, return_dict=False
-                )[0]
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
 
+                # Ensure latents have the correct dtype
                 if latents.dtype != latents_dtype:
                     if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                         latents = latents.to(latents_dtype)
 
+                # Callback at the end of the step, if provided
                 if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
+                    callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs}
                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.get("latents", latents)
+                    prompt_embeds = callback_outputs.get("prompt_embeds", prompt_embeds)
 
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or (
-                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
-                ):
+                # Update the progress bar
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
+                # Mark step for XLA devices
                 if XLA_AVAILABLE:
                     xm.mark_step()
 
@@ -932,7 +939,6 @@ from diffusers.utils import BaseOutput
 class FluxPipelineOutput(BaseOutput):
     """
     Output class for Stable Diffusion pipelines.
-
     Args:
         images (`List[PIL.Image.Image]` or `np.ndarray`)
             List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
 
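For readers skimming the large hunk above: the substantive change is that real classifier-free guidance (CFG) can now run as a single batched transformer call, with negative and positive conditioning concatenated along the batch dimension, instead of two sequential calls, gated by the new `do_batch_cfg` flag. Below is a minimal, self-contained sketch of that pattern; `denoise` is a hypothetical stand-in for `self.transformer`, and all names here are illustrative rather than part of the pipeline above.

```py
import torch


def batched_cfg_step(denoise, latents, cond_embeds, uncond_embeds, guidance_scale_real):
    # Duplicate the latents and stack [unconditional, conditional] text embeddings
    # so a single forward pass covers both branches of CFG.
    latent_model_input = torch.cat([latents, latents], dim=0)
    text_embeds = torch.cat([uncond_embeds, cond_embeds], dim=0)

    noise_pred = denoise(latent_model_input, text_embeds)

    # Split the stacked prediction back apart and recombine with the CFG weighting
    # used in the diff: uncond + scale * (cond - uncond).
    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)


# Toy usage with a stand-in denoiser that ignores the text embeddings.
toy_denoise = lambda x, e: torch.zeros_like(x)
out = batched_cfg_step(toy_denoise, torch.randn(1, 16), torch.randn(1, 8), torch.randn(1, 8), 5.0)
print(out.shape)  # torch.Size([1, 16])
```

Compared with the sequential path (kept in the diff behind `do_batch_cfg = False`), the batched variant halves the number of transformer forward passes per step at the cost of roughly doubled activation memory for that call.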