SkyReels_L

Paused

App Files Files Community

1inkusFace committed on Apr 2

Commit

3ac5b04

verified ·

1 Parent(s): daaecb2

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -53

app.py CHANGED Viewed

@@ -30,8 +30,8 @@ os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv("TOKENIZERS_PARALLELISM","False")
 def init_predictor():
-    global predictor
-    predictor = SkyReelsVideoSingleGpuInfer(
         task_type= TaskType.I2V,
         model_id="Skywork/SkyReels-V1-Hunyuan-I2V",
         quant_model=False,
@@ -45,58 +45,55 @@ def init_predictor():
 @spaces.GPU(duration=60)
 def generate_video(segment, image, prompt, size, guidance_scale, num_inference_steps, frames, seed, progress=gr.Progress(track_tqdm=True) ):
     random.seed(time.time())
     seed = int(random.randrange(4294967294))
     if segment==1:
         prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
         prompt=prompt, prompt_2=prompt, device=device
         )
-        transformer_pooled_projections = pooled_prompt_embeds
-        transformer_pooled_projections = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
         pipe.scheduler.set_timesteps(num_inference_steps, device=torch.device('cuda'))
         timesteps = pipe.scheduler.timesteps
         all_timesteps_cpu = timesteps.cpu()
         timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
         segment_timesteps = torch.from_numpy(timesteps_split_np[0]).to("cuda")
         num_channels_latents = pipe.transformer.config.in_channels
         latents = pipe.prepare_latents(
             batch_size=1, num_channels_latents=pipe.transformer.config.in_channels, height=height, width=width, num_frames=frames,
             dtype=torch.float32, device=device, generator=generator, latents=None,
         )
         guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
-        kwargs = {
-            "prompt": prompt,
-            "height": size,
-            "width": size,
-            "num_frames": frames,
-            "num_inference_steps": steps,
-            "seed": seed,
-            "guidance_scale": guidance_scale,
-            "embedded_guidance_scale": 1.0,
-            "negative_prompt": "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion",
-            "cfg_for": False,
-        }
-        assert image is not None, "please input image"
-        img = load_image(image=image)
-        img.resize((size,size), Image.LANCZOS)
-        kwargs["image"] = img
     else:
         state_file = f"rv_L_{segment-1}_{seed}.pt"
         state = torch.load(state_file, weights_only=False)
         generator = torch.Generator(device='cuda').manual_seed(seed)
-        current_latents = latents
     for i, t in enumerate(pipe.progress_bar(segment_timesteps)):
-                latent_model_input = latents.to(transformer_dtype)
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 with torch.no_grad():
                   noise_pred = self.transformer(
@@ -106,38 +103,29 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
                     encoder_attention_mask=prompt_attention_mask,
                     pooled_projections=pooled_prompt_embeds,
                     guidance=guidance,
-                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                   )[0]
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-                else:
-                    video = latents
-                    return latents
-    intermediate_latents_cpu = current_latents.detach().cpu()
     if segment==8:
         latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
         video = self.vae.decode(latents, return_dict=False)[0]
         video = self.video_processor.postprocess_video(video, output_type=output_type)
-        return HunyuanVideoPipelineOutput(frames=video)
         save_dir = f"./"
         video_out_file = f"{save_dir}/{seed}.mp4"
         print(f"generate video, local path: {video_out_file}")
         export_to_video(output, video_out_file, fps=24)
         return video_out_file, seed
     else:
         original_prompt_embeds_cpu = prompt_embeds.cpu()
-        original_negative_prompt_embeds_cpu = negative_prompt_embeds.cpu()
         original_pooled_prompt_embeds_cpu = pooled_prompt_embeds.cpu()
-        original_negative_pooled_prompt_embeds_cpu = negative_pooled_prompt_embeds.cpu()
         original_add_time_ids_cpu = add_time_ids.cpu()
         timesteps = pipe.scheduler.timesteps
         all_timesteps_cpu = timesteps.cpu() # Move to CPU
@@ -145,9 +133,9 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
             "intermediate_latents": intermediate_latents_cpu,
             "all_timesteps": all_timesteps_cpu, # Save full list generated by scheduler
             "prompt_embeds": original_prompt_embeds_cpu, # Save ORIGINAL embeds
-            "negative_prompt_embeds": original_negative_prompt_embeds_cpu,
             "pooled_prompt_embeds": original_pooled_prompt_embeds_cpu,
-            "negative_pooled_prompt_embeds": original_negative_pooled_prompt_embeds_cpu,
             "add_time_ids": original_add_time_ids_cpu, # Save ORIGINAL time IDs
             "guidance_scale": guidance_scale,
             "timesteps_split": timesteps_split_for_state,
@@ -160,8 +148,6 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
         state_file = f"SkyReel_{segment}_{seed}.pt"
         torch.save(state, state_file)
         return None, seed
 with gr.Blocks() as demo:
         with gr.Row():
@@ -212,7 +198,6 @@ with gr.Blocks() as demo:
         inputs=num_inference_steps,
         outputs=range_sliders,
         )
         gr.Examples(
         examples=examples,
         inputs=prompt,
@@ -361,7 +346,6 @@ with gr.Blocks() as demo:
         outputs=[result, seed],
     )
 # Script entry point: runs only when the file is executed directly, not on import.
 if __name__ == "__main__":
     # Run the predictor setup first so it completes before the UI starts.
     init_predictor()
     # Launch the app built in the `with gr.Blocks() as demo:` context.
     demo.launch()

 os.putenv("TOKENIZERS_PARALLELISM","False")
 def init_predictor():
+    global pipe
+    pipe = SkyReelsVideoSingleGpuInfer(
         task_type= TaskType.I2V,
         model_id="Skywork/SkyReels-V1-Hunyuan-I2V",
         quant_model=False,
 @spaces.GPU(duration=60)
 def generate_video(segment, image, prompt, size, guidance_scale, num_inference_steps, frames, seed, progress=gr.Progress(track_tqdm=True) ):
     random.seed(time.time())
     seed = int(random.randrange(4294967294))
     if segment==1:
         prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
         prompt=prompt, prompt_2=prompt, device=device
         )
         pipe.scheduler.set_timesteps(num_inference_steps, device=torch.device('cuda'))
         timesteps = pipe.scheduler.timesteps
         all_timesteps_cpu = timesteps.cpu()
         timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
         segment_timesteps = torch.from_numpy(timesteps_split_np[0]).to("cuda")
         num_channels_latents = pipe.transformer.config.in_channels
+        num_channels_latents = int(num_channels_latents / 2)
+            image = pipe.video_processor.preprocess(image, height=height, width=width).to(
+                device, dtype=prompt_embeds.dtype
+            )
+        num_latent_frames = (frames - 1) // pipe.vae_scale_factor_temporal + 1
         latents = pipe.prepare_latents(
             batch_size=1, num_channels_latents=pipe.transformer.config.in_channels, height=height, width=width, num_frames=frames,
             dtype=torch.float32, device=device, generator=generator, latents=None,
         )
+        image_latents = pipe.image_latents(
+                image, batch_size, height, width, device, torch.float32, num_channels_latents, num_latent_frames
+        )
+        image_latents = image_latents.to(pipe.transformer.dtype)
         guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
     else:
         state_file = f"rv_L_{segment-1}_{seed}.pt"
         state = torch.load(state_file, weights_only=False)
         generator = torch.Generator(device='cuda').manual_seed(seed)
+        latents = state["intermediate_latents"].to("cuda", dtype=torch.bfloat16)
+        guidance_scale = state["guidance_scale"]
+        all_timesteps_cpu = state["all_timesteps"]
+        height = state["height"]
+        width = state["width"]
+        pipe.scheduler.set_timesteps(len(all_timesteps_cpu), device=device)
+        timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
+        segment_timesteps = torch.from_numpy(timesteps_split_np[segment - 1]).to("cuda")
+        prompt_embeds = state["prompt_embeds"].to("cuda", dtype=torch.bfloat16)
+        pooled_prompt_embeds = state["pooled_prompt_embeds"].to("cuda", dtype=torch.bfloat16)
+        prompt_attention_mask = state["prompt_attention_mask"].to("cuda", dtype=torch.bfloat16)
+        image_latents = state["image_latents"].to("cuda", dtype=torch.bfloat16)
     for i, t in enumerate(pipe.progress_bar(segment_timesteps)):
+                latents = latents.to(transformer_dtype)
+                latent_model_input = torch.cat([latents] * 2)
+                latent_image_input = (
+                    torch.cat([image_latents] * 2)
+                )
+                latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=1)
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 with torch.no_grad():
                   noise_pred = self.transformer(
                     encoder_attention_mask=prompt_attention_mask,
                     pooled_projections=pooled_prompt_embeds,
                     guidance=guidance,
+                    #   attention_kwargs=attention_kwargs,
                     return_dict=False,
                   )[0]
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+    intermediate_latents_cpu = latents.detach().cpu()
     if segment==8:
         latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
         video = self.vae.decode(latents, return_dict=False)[0]
         video = self.video_processor.postprocess_video(video, output_type=output_type)
+        # return HunyuanVideoPipelineOutput(frames=video)
         save_dir = f"./"
         video_out_file = f"{save_dir}/{seed}.mp4"
         print(f"generate video, local path: {video_out_file}")
         export_to_video(output, video_out_file, fps=24)
         return video_out_file, seed
     else:
         original_prompt_embeds_cpu = prompt_embeds.cpu()
+        original_image_latents_cpu = image_latents.cpu()
         original_pooled_prompt_embeds_cpu = pooled_prompt_embeds.cpu()
+        original_prompt_attention_mask_cpu = prompt_attention_mask.cpu()
         original_add_time_ids_cpu = add_time_ids.cpu()
         timesteps = pipe.scheduler.timesteps
         all_timesteps_cpu = timesteps.cpu() # Move to CPU
             "intermediate_latents": intermediate_latents_cpu,
             "all_timesteps": all_timesteps_cpu, # Save full list generated by scheduler
             "prompt_embeds": original_prompt_embeds_cpu, # Save ORIGINAL embeds
+            "image_latents": original_image_latents_cpu,
             "pooled_prompt_embeds": original_pooled_prompt_embeds_cpu,
+            "prompt_attention_mask": original_prompt_attention_mask_cpu,
             "add_time_ids": original_add_time_ids_cpu, # Save ORIGINAL time IDs
             "guidance_scale": guidance_scale,
             "timesteps_split": timesteps_split_for_state,
         state_file = f"SkyReel_{segment}_{seed}.pt"
         torch.save(state, state_file)
         return None, seed
 with gr.Blocks() as demo:
         with gr.Row():
         inputs=num_inference_steps,
         outputs=range_sliders,
         )
         gr.Examples(
         examples=examples,
         inputs=prompt,
         outputs=[result, seed],
     )
 # Script entry point: runs only when the file is executed directly, not on import.
 if __name__ == "__main__":
     # Run the predictor setup first so it completes before the UI starts.
     init_predictor()
     # Launch the app built in the `with gr.Blocks() as demo:` context.
     demo.launch()