HuiZhang committed
Commit 6aed6eb · verified · 1 Parent(s): 9f98f9a

Update src/pipeline/pipeline_CreatiLayout.py

src/pipeline/pipeline_CreatiLayout.py CHANGED
@@ -420,21 +420,21 @@ class CreatiLayoutSD3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleF
             clip_skip=clip_skip,
             clip_model_index=1,
         )
-        clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)  # torch.Size([B, 77, 768]) + torch.Size([B, 77, 1280]) -> torch.Size([B, 77, 2048])
+        clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
 
         t5_prompt_embed = self._get_t5_prompt_embeds(
             prompt=prompt_3,
             num_images_per_prompt=num_images_per_prompt,
             max_sequence_length=max_sequence_length,
             device=device,
-        )  # [B, 256, 4096]
+        )
 
         clip_prompt_embeds = torch.nn.functional.pad(
             clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
-        )  # [B, 77, 4096]
+        )
 
-        prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)  # torch.Size([B, 333 (256+77), 4096])
-        pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)  # [B, 2048]
+        prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+        pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
 
         if do_classifier_free_guidance and negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
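
The comments deleted in this hunk documented the tensor shapes at each step. As a sanity check, here is a minimal sketch with dummy tensors, assuming the shapes recorded in the deleted comments (CLIP-L 768-d, CLIP-G 1280-d, T5 4096-d at max_sequence_length=256); it reproduces the same concatenate-pad-concatenate arithmetic:

import torch

B = 2  # hypothetical batch size
prompt_embed = torch.randn(B, 77, 768)       # CLIP-L sequence features (per deleted comment)
prompt_2_embed = torch.randn(B, 77, 1280)    # CLIP-G sequence features
t5_prompt_embed = torch.randn(B, 256, 4096)  # T5 features at max_sequence_length=256

clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)    # [B, 77, 2048]
# Zero-pad the channel dim so CLIP features can sit alongside T5 features.
clip_prompt_embeds = torch.nn.functional.pad(
    clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
)                                                                         # [B, 77, 4096]
prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)  # [B, 333, 4096]
assert prompt_embeds.shape == (B, 77 + 256, 4096)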
@@ -867,15 +867,9 @@ class CreatiLayoutSD3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleF
         # 5.5 layout
         max_objs = 10
         if len(bbox_raw) > max_objs:
-
             print(f"More that {max_objs} objects found. Only first {max_objs} objects will be processed.")
-
             bbox_phrases = bbox_phrases[:max_objs]
             bbox_raw = bbox_raw[:max_objs]
-        # prepare batched input to the GLIGENTextBoundingboxProjection (boxes, phrases, mask)
-        # Get tokens for phrases from pre-trained CLIPTokenizer
-        # from IPython.core.debugger import set_trace
-        # set_trace()
         tokenizer_inputs = self.tokenizer(
             bbox_phrases,
             padding="max_length",
@@ -883,8 +877,6 @@ class CreatiLayoutSD3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleF
             truncation=True,
             return_tensors="pt",
         ).input_ids.to(device)
-        # For the token, we use the same pre-trained text encoder
-        # to obtain its text feature
 
         text_embeddings_1 = self.text_encoder(tokenizer_inputs.to(device), output_hidden_states=True)[0]
 
@@ -896,9 +888,7 @@ class CreatiLayoutSD3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleF
             truncation=True,
             return_tensors="pt",
         ).input_ids.to(device)
-        # For the token, we use the same pre-trained text encoder
-        # to obtain its text feature
-
+
         text_embeddings_2 = self.text_encoder_2(tokenizer_inputs_2.to(device), output_hidden_states=True)[0]
 
         clip_text_embeddings = torch.cat([text_embeddings_1, text_embeddings_2], dim=-1)
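
The retained lines run the same phrase tokens through both CLIP text encoders and concatenate the results. A shape sketch, assuming (as in the stock SD3 pipeline) both encoders are CLIPTextModelWithProjection, whose first return value is the pooled, projected embedding (768-d for CLIP-L, 1280-d for CLIP-G):

import torch

num_objs = 10  # hypothetical number of layout phrases
text_embeddings_1 = torch.randn(num_objs, 768)   # pooled CLIP-L phrase embeddings
text_embeddings_2 = torch.randn(num_objs, 1280)  # pooled CLIP-G phrase embeddings
clip_text_embeddings = torch.cat([text_embeddings_1, text_embeddings_2], dim=-1)
assert clip_text_embeddings.shape == (num_objs, 2048)  # one 2048-d vector per phrase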
 