Spaces: Running on Zero
update main file, fix local bugs
app.py (CHANGED)
@@ -6,8 +6,8 @@ import numpy as np
import random
import spaces
import torch
-from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as load_sft
+from huggingface_hub import hf_hub_download

from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler, AutoencoderTiny, AutoencoderKL, FluxPipeline
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
@@ -29,10 +29,10 @@ def calculate_shift(

def retrieve_timesteps(
    scheduler,
-    num_inference_steps: Optional
-    device: Optional
-    timesteps: Optional
-    sigmas: Optional
+    num_inference_steps: Optional = None,
+    device: Optional = None,
+    timesteps: Optional = None,
+    sigmas: Optional = None,
    **kwargs,
):
    if timesteps is not None and sigmas is not None:
@@ -54,23 +54,23 @@ def retrieve_timesteps(
@torch.inference_mode()
def flux_pipe_call_that_returns_an_iterable_of_images(
    self,
-    prompt
-    prompt_2
-    height
-    width
+    prompt = None,
+    prompt_2 = None,
+    height = None,
+    width = None,
    num_inference_steps: int = 28,
-    timesteps
+    timesteps = None,
    guidance_scale: float = 3.5,
-    num_images_per_prompt
-    generator
-    latents
-    prompt_embeds
-    pooled_prompt_embeds
-    output_type
-    return_dict
-    joint_attention_kwargs
-    max_sequence_length
-    good_vae
+    num_images_per_prompt = 1,
+    generator = None,
+    latents = None,
+    prompt_embeds = None,
+    pooled_prompt_embeds = None,
+    output_type = "pil",
+    return_dict = True,
+    joint_attention_kwargs = None,
+    max_sequence_length = 512,
+    good_vae = None,
):
    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor
@@ -92,7 +92,10 @@ def flux_pipe_call_that_returns_an_iterable_of_images(

    # 2. Define call parameters
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
-
+    try:
+        device = self._execution_device
+    except:
+        device = torch.device('cuda:0')

    # 3. Encode prompt
    lora_scale = joint_attention_kwargs.get("scale", None) if joint_attention_kwargs is not None else None
@@ -107,7 +110,7 @@ def flux_pipe_call_that_returns_an_iterable_of_images(
        lora_scale=lora_scale,
    )
    # 4. Prepare latent variables
-    num_channels_latents = self.transformer.
+    num_channels_latents = self.transformer.in_channels // 4
    latents, latent_image_ids = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
@@ -139,26 +142,25 @@ def flux_pipe_call_that_returns_an_iterable_of_images(
    self._num_timesteps = len(timesteps)

    # Handle guidance
-    guidance = torch.full([1], guidance_scale, device=device, dtype=
+    guidance = torch.full([1], guidance_scale, device=device, dtype=dtype).expand(latents.shape[0])  # if self.transformer.params.guidance_embeds else None

+    # print(latent_image_ids.shape, text_ids.shape, pooled_prompt_embeds.shape)
    # 6. Denoising loop
    for i, t in enumerate(timesteps):
        if self.interrupt:
            continue

-        timestep = t.expand(latents.shape[0]).to(
+        timestep = t.expand(latents.shape[0]).to(dtype)

        noise_pred = self.transformer(
-
-
-            guidance=guidance,
-
-
-            txt_ids=text_ids,
-            img_ids=latent_image_ids,
-
-            return_dict=False,
-        )[0]
+            img=latents.to(dtype).to(device),
+            timesteps=(timestep / 1000).to(dtype),
+            guidance=guidance.to(dtype).to(device),
+            y=pooled_prompt_embeds.to(dtype).to(device),
+            txt=prompt_embeds.to(dtype).to(device),
+            txt_ids=text_ids.to(dtype).to(device),
+            img_ids=latent_image_ids.to(dtype).to(device),
+        )
        # Yield intermediate result
        latents_for_image = self._unpack_latents(latents, height, width, self.vae_scale_factor)
        latents_for_image = (latents_for_image / self.vae.config.scaling_factor) + self.vae.config.shift_factor
@@ -184,6 +186,7 @@ class ModelSpec:
    repo_flow: str
    repo_ae: str
    repo_id_ae: str
+    ckpt_path: str


config = ModelSpec(
@@ -191,6 +194,7 @@ config = ModelSpec(
    repo_flow="flux-mini.safetensors",
    repo_id_ae="black-forest-labs/FLUX.1-dev",
    repo_ae="ae.safetensors",
+    ckpt_path=None,
    params=FluxParams(
        in_channels=64,
        vec_in_dim=768,
@@ -209,11 +213,14 @@ config = ModelSpec(


def load_flow_model2(config, device: str = "cuda", hf_download: bool = True):
-    if (config.
+    if (config.ckpt_path is None
+        and config.repo_id is not None
        and config.repo_flow is not None
        and hf_download
    ):
        ckpt_path = hf_hub_download(config.repo_id, config.repo_flow.replace("sft", "safetensors"))
+    else:
+        ckpt_path = config.ckpt_path

    model = Flux(config.params)
    if ckpt_path is not None:
@@ -226,12 +233,12 @@ dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="scheduler")
-vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=dtype).to(device)
-text_encoder = CLIPTextModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder").to(device)
+good_vae = vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=dtype).to(device)
+text_encoder = CLIPTextModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder", torch_dtype=dtype).to(device)
tokenizer = CLIPTokenizer.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="tokenizer")
-text_encoder_2 = T5EncoderModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder_2").to(device)
+text_encoder_2 = T5EncoderModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder_2", torch_dtype=dtype).to(device)
tokenizer_2 = T5TokenizerFast.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="tokenizer_2")
-transformer = load_flow_model2(config, device)
+transformer = load_flow_model2(config, device).to(dtype).to(device)

pipe = FluxPipeline(
    scheduler,
@@ -245,19 +252,20 @@ pipe = FluxPipeline(
torch.cuda.empty_cache()

MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE =
+MAX_IMAGE_SIZE = 1024

pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(pipe)

@spaces.GPU(duration=75)
def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024, guidance_scale=3.5, num_inference_steps=28, progress=gr.Progress(track_tqdm=True)):
+    torch.cuda.empty_cache()
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed)

    for img in pipe.flux_pipe_call_that_returns_an_iterable_of_images(
        prompt=prompt,
-        guidance_scale=
+        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
@@ -265,12 +273,13 @@ def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024, guidan
        output_type="pil",
        good_vae=good_vae,
    ):
-
+        pass
+    return img, seed

examples = [
+    "a lovely cat",
    "thousands of luminous oysters on a shore reflecting and refracting the sunset",
-    "profile of sad Socrates, full body, high detail, dramatic scene, Epic dynamic action, wide angle, cinematic, hyper realistic, concept art, warm muted tones as painted by Bernie Wrightson, Frank Frazetta,"
-    "ghosts, astronauts, robots, cats, superhero costumes, line drawings, naive, simple, exploring a strange planet, coloured pencil crayons, , black canvas background, drawn by 5 year old child",
+    "profile of sad Socrates, full body, high detail, dramatic scene, Epic dynamic action, wide angle, cinematic, hyper realistic, concept art, warm muted tones as painted by Bernie Wrightson, Frank Frazetta,"
]

css="""
@@ -365,4 +374,4 @@ A 3.2B param rectified flow transformer distilled from [FLUX.1 [dev]](https://bl
        outputs = [result, seed]
    )

-demo.launch()
+demo.launch(server_name='0.0.0.0', server_port=12345)