1inkusFace committed
Commit 3ac5b04 · verified · 1 Parent(s): daaecb2

Update app.py

Files changed (1): app.py +37 -53
app.py CHANGED
@@ -30,8 +30,8 @@ os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv("TOKENIZERS_PARALLELISM","False")
 
 def init_predictor():
-    global predictor
-    predictor = SkyReelsVideoSingleGpuInfer(
+    global pipe
+    pipe = SkyReelsVideoSingleGpuInfer(
         task_type= TaskType.I2V,
         model_id="Skywork/SkyReels-V1-Hunyuan-I2V",
         quant_model=False,
@@ -45,58 +45,55 @@ def init_predictor():
 
 @spaces.GPU(duration=60)
 def generate_video(segment, image, prompt, size, guidance_scale, num_inference_steps, frames, seed, progress=gr.Progress(track_tqdm=True) ):
-
     random.seed(time.time())
     seed = int(random.randrange(4294967294))
     if segment==1:
-
-
         prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
             prompt=prompt, prompt_2=prompt, device=device
         )
-        transformer_pooled_projections = pooled_prompt_embeds
-        transformer_pooled_projections = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
         pipe.scheduler.set_timesteps(num_inference_steps, device=torch.device('cuda'))
         timesteps = pipe.scheduler.timesteps
         all_timesteps_cpu = timesteps.cpu()
         timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
         segment_timesteps = torch.from_numpy(timesteps_split_np[0]).to("cuda")
-
         num_channels_latents = pipe.transformer.config.in_channels
+        num_channels_latents = int(num_channels_latents / 2)
+        image = pipe.video_processor.preprocess(image, height=height, width=width).to(
+            device, dtype=prompt_embeds.dtype
+        )
+        num_latent_frames = (frames - 1) // pipe.vae_scale_factor_temporal + 1
         latents = pipe.prepare_latents(
             batch_size=1, num_channels_latents=pipe.transformer.config.in_channels, height=height, width=width, num_frames=frames,
             dtype=torch.float32, device=device, generator=generator, latents=None,
         )
+        image_latents = pipe.image_latents(
+            image, batch_size, height, width, device, torch.float32, num_channels_latents, num_latent_frames
+        )
+        image_latents = image_latents.to(pipe.transformer.dtype)
         guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
-
-        kwargs = {
-            "prompt": prompt,
-            "height": size,
-            "width": size,
-            "num_frames": frames,
-            "num_inference_steps": steps,
-            "seed": seed,
-            "guidance_scale": guidance_scale,
-            "embedded_guidance_scale": 1.0,
-            "negative_prompt": "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion",
-            "cfg_for": False,
-        }
-        assert image is not None, "please input image"
-        img = load_image(image=image)
-        img.resize((size,size), Image.LANCZOS)
-        kwargs["image"] = img
     else:
         state_file = f"rv_L_{segment-1}_{seed}.pt"
         state = torch.load(state_file, weights_only=False)
         generator = torch.Generator(device='cuda').manual_seed(seed)
-
-
-        current_latents = latents
-
+        latents = state["intermediate_latents"].to("cuda", dtype=torch.bfloat16)
+        guidance_scale = state["guidance_scale"]
+        all_timesteps_cpu = state["all_timesteps"]
+        height = state["height"]
+        width = state["width"]
+        pipe.scheduler.set_timesteps(len(all_timesteps_cpu), device=device)
+        timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
+        segment_timesteps = torch.from_numpy(timesteps_split_np[segment - 1]).to("cuda")
+        prompt_embeds = state["prompt_embeds"].to("cuda", dtype=torch.bfloat16)
+        pooled_prompt_embeds = state["pooled_prompt_embeds"].to("cuda", dtype=torch.bfloat16)
+        prompt_attention_mask = state["prompt_attention_mask"].to("cuda", dtype=torch.bfloat16)
+        image_latents = state["image_latents"].to("cuda", dtype=torch.bfloat16)
     for i, t in enumerate(pipe.progress_bar(segment_timesteps)):
-
-        latent_model_input = latents.to(transformer_dtype)
-
+        latents = latents.to(transformer_dtype)
+        latent_model_input = torch.cat([latents] * 2)
+        latent_image_input = (
+            torch.cat([image_latents] * 2)
+        )
+        latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=1)
         timestep = t.expand(latents.shape[0]).to(latents.dtype)
         with torch.no_grad():
             noise_pred = self.transformer(
@@ -106,38 +103,29 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
                 encoder_attention_mask=prompt_attention_mask,
                 pooled_projections=pooled_prompt_embeds,
                 guidance=guidance,
-                attention_kwargs=attention_kwargs,
+                # attention_kwargs=attention_kwargs,
                 return_dict=False,
             )[0]
 
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
         latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-        else:
-            video = latents
-            return latents
-
-    intermediate_latents_cpu = current_latents.detach().cpu()
-
+        intermediate_latents_cpu = latents.detach().cpu()
     if segment==8:
         latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
         video = self.vae.decode(latents, return_dict=False)[0]
         video = self.video_processor.postprocess_video(video, output_type=output_type)
-
-        return HunyuanVideoPipelineOutput(frames=video)
-
+        # return HunyuanVideoPipelineOutput(frames=video)
        save_dir = f"./"
        video_out_file = f"{save_dir}/{seed}.mp4"
        print(f"generate video, local path: {video_out_file}")
        export_to_video(output, video_out_file, fps=24)
-
        return video_out_file, seed
     else:
        original_prompt_embeds_cpu = prompt_embeds.cpu()
-       original_negative_prompt_embeds_cpu = negative_prompt_embeds.cpu()
+       original_image_latents_cpu = image_latents.cpu()
        original_pooled_prompt_embeds_cpu = pooled_prompt_embeds.cpu()
-       original_negative_pooled_prompt_embeds_cpu = negative_pooled_prompt_embeds.cpu()
+       original_prompt_attention_mask_cpu = prompt_attention_mask.cpu()
        original_add_time_ids_cpu = add_time_ids.cpu()
        timesteps = pipe.scheduler.timesteps
        all_timesteps_cpu = timesteps.cpu() # Move to CPU
@@ -145,9 +133,9 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
            "intermediate_latents": intermediate_latents_cpu,
            "all_timesteps": all_timesteps_cpu, # Save full list generated by scheduler
            "prompt_embeds": original_prompt_embeds_cpu, # Save ORIGINAL embeds
-           "negative_prompt_embeds": original_negative_prompt_embeds_cpu,
+           "image_latents": original_image_latents_cpu,
            "pooled_prompt_embeds": original_pooled_prompt_embeds_cpu,
-           "negative_pooled_prompt_embeds": original_negative_pooled_prompt_embeds_cpu,
+           "prompt_attention_mask": original_prompt_attention_mask_cpu,
            "add_time_ids": original_add_time_ids_cpu, # Save ORIGINAL time IDs
            "guidance_scale": guidance_scale,
            "timesteps_split": timesteps_split_for_state,
@@ -160,8 +148,6 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
        state_file = f"SkyReel_{segment}_{seed}.pt"
        torch.save(state, state_file)
        return None, seed
-
-
 
 with gr.Blocks() as demo:
     with gr.Row():
@@ -212,7 +198,6 @@ with gr.Blocks() as demo:
         inputs=num_inference_steps,
         outputs=range_sliders,
     )
-
     gr.Examples(
         examples=examples,
         inputs=prompt,
@@ -361,7 +346,6 @@ with gr.Blocks() as demo:
         outputs=[result, seed],
     )
 
-
 if __name__ == "__main__":
     init_predictor()
     demo.launch()
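
Note: the updated generate_video splits the scheduler's timesteps into eight chunks so that each @spaces.GPU(duration=60) call only denoises one chunk, handing the intermediate latents (along with the prompt embeddings and image latents) to the next call through a torch.save / torch.load checkpoint on disk. Below is a minimal, self-contained sketch of that save/resume pattern; the helper name run_segment, the file naming, the tensor shapes, and the toy "denoise" update are illustrative stand-ins, not code from the Space.

# Illustrative sketch of the segmented save/resume pattern (names and shapes are assumptions).
import numpy as np
import torch

NUM_SEGMENTS = 8  # the commit splits the scheduler timesteps into 8 chunks

def run_segment(segment: int, seed: int) -> torch.Tensor:
    # Stand-in for pipe.scheduler.timesteps after set_timesteps().
    all_timesteps = torch.linspace(1.0, 0.0, 40)
    if segment == 1:
        # First GPU call: start from fresh latents.
        latents = torch.randn(1, 4, 8, 8, generator=torch.Generator().manual_seed(seed))
    else:
        # Later GPU calls: resume from the checkpoint written by the previous segment.
        state = torch.load(f"segment_{segment - 1}_{seed}.pt", weights_only=False)
        latents = state["intermediate_latents"]

    # Only this segment's slice of the full schedule is processed in the current GPU window.
    chunk = np.array_split(all_timesteps.numpy(), NUM_SEGMENTS)[segment - 1]
    for t in torch.from_numpy(chunk):
        latents = latents - 0.01 * t * latents  # toy stand-in for transformer forward + scheduler.step

    if segment < NUM_SEGMENTS:
        # Persist intermediate latents on CPU so the next call can pick up where this one stopped.
        torch.save({"intermediate_latents": latents.detach().cpu()},
                   f"segment_{segment}_{seed}.pt")
    return latents

if __name__ == "__main__":
    out = None
    for seg in range(1, NUM_SEGMENTS + 1):
        out = run_segment(seg, seed=1234)
    print(out.shape)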