Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -30,8 +30,8 @@ os.environ["SAFETENSORS_FAST_GPU"] = "1"
|
|
30 |
os.putenv("TOKENIZERS_PARALLELISM","False")
|
31 |
|
32 |
def init_predictor():
|
33 |
-
global
|
34 |
-
|
35 |
task_type= TaskType.I2V,
|
36 |
model_id="Skywork/SkyReels-V1-Hunyuan-I2V",
|
37 |
quant_model=False,
|
@@ -45,58 +45,55 @@ def init_predictor():
|
|
45 |
|
46 |
@spaces.GPU(duration=60)
|
47 |
def generate_video(segment, image, prompt, size, guidance_scale, num_inference_steps, frames, seed, progress=gr.Progress(track_tqdm=True) ):
|
48 |
-
|
49 |
random.seed(time.time())
|
50 |
seed = int(random.randrange(4294967294))
|
51 |
if segment==1:
|
52 |
-
|
53 |
-
|
54 |
prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
|
55 |
prompt=prompt, prompt_2=prompt, device=device
|
56 |
)
|
57 |
-
transformer_pooled_projections = pooled_prompt_embeds
|
58 |
-
transformer_pooled_projections = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
|
59 |
pipe.scheduler.set_timesteps(num_inference_steps, device=torch.device('cuda'))
|
60 |
timesteps = pipe.scheduler.timesteps
|
61 |
all_timesteps_cpu = timesteps.cpu()
|
62 |
timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
|
63 |
segment_timesteps = torch.from_numpy(timesteps_split_np[0]).to("cuda")
|
64 |
-
|
65 |
num_channels_latents = pipe.transformer.config.in_channels
|
|
|
|
|
|
|
|
|
|
|
66 |
latents = pipe.prepare_latents(
|
67 |
batch_size=1, num_channels_latents=pipe.transformer.config.in_channels, height=height, width=width, num_frames=frames,
|
68 |
dtype=torch.float32, device=device, generator=generator, latents=None,
|
69 |
)
|
|
|
|
|
|
|
|
|
70 |
guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
|
71 |
-
|
72 |
-
kwargs = {
|
73 |
-
"prompt": prompt,
|
74 |
-
"height": size,
|
75 |
-
"width": size,
|
76 |
-
"num_frames": frames,
|
77 |
-
"num_inference_steps": steps,
|
78 |
-
"seed": seed,
|
79 |
-
"guidance_scale": guidance_scale,
|
80 |
-
"embedded_guidance_scale": 1.0,
|
81 |
-
"negative_prompt": "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion",
|
82 |
-
"cfg_for": False,
|
83 |
-
}
|
84 |
-
assert image is not None, "please input image"
|
85 |
-
img = load_image(image=image)
|
86 |
-
img.resize((size,size), Image.LANCZOS)
|
87 |
-
kwargs["image"] = img
|
88 |
else:
|
89 |
state_file = f"rv_L_{segment-1}_{seed}.pt"
|
90 |
state = torch.load(state_file, weights_only=False)
|
91 |
generator = torch.Generator(device='cuda').manual_seed(seed)
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
for i, t in enumerate(pipe.progress_bar(segment_timesteps)):
|
97 |
-
|
98 |
-
latent_model_input =
|
99 |
-
|
|
|
|
|
|
|
100 |
timestep = t.expand(latents.shape[0]).to(latents.dtype)
|
101 |
with torch.no_grad():
|
102 |
noise_pred = self.transformer(
|
@@ -106,38 +103,29 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
|
|
106 |
encoder_attention_mask=prompt_attention_mask,
|
107 |
pooled_projections=pooled_prompt_embeds,
|
108 |
guidance=guidance,
|
109 |
-
attention_kwargs=attention_kwargs,
|
110 |
return_dict=False,
|
111 |
)[0]
|
112 |
|
113 |
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
114 |
-
noise_pred = noise_pred_uncond +
|
115 |
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
116 |
-
|
117 |
-
else:
|
118 |
-
video = latents
|
119 |
-
return latents
|
120 |
-
|
121 |
-
intermediate_latents_cpu = current_latents.detach().cpu()
|
122 |
-
|
123 |
if segment==8:
|
124 |
latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
|
125 |
video = self.vae.decode(latents, return_dict=False)[0]
|
126 |
video = self.video_processor.postprocess_video(video, output_type=output_type)
|
127 |
-
|
128 |
-
return HunyuanVideoPipelineOutput(frames=video)
|
129 |
-
|
130 |
save_dir = f"./"
|
131 |
video_out_file = f"{save_dir}/{seed}.mp4"
|
132 |
print(f"generate video, local path: {video_out_file}")
|
133 |
export_to_video(output, video_out_file, fps=24)
|
134 |
-
|
135 |
return video_out_file, seed
|
136 |
else:
|
137 |
original_prompt_embeds_cpu = prompt_embeds.cpu()
|
138 |
-
|
139 |
original_pooled_prompt_embeds_cpu = pooled_prompt_embeds.cpu()
|
140 |
-
|
141 |
original_add_time_ids_cpu = add_time_ids.cpu()
|
142 |
timesteps = pipe.scheduler.timesteps
|
143 |
all_timesteps_cpu = timesteps.cpu() # Move to CPU
|
@@ -145,9 +133,9 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
|
|
145 |
"intermediate_latents": intermediate_latents_cpu,
|
146 |
"all_timesteps": all_timesteps_cpu, # Save full list generated by scheduler
|
147 |
"prompt_embeds": original_prompt_embeds_cpu, # Save ORIGINAL embeds
|
148 |
-
"
|
149 |
"pooled_prompt_embeds": original_pooled_prompt_embeds_cpu,
|
150 |
-
"
|
151 |
"add_time_ids": original_add_time_ids_cpu, # Save ORIGINAL time IDs
|
152 |
"guidance_scale": guidance_scale,
|
153 |
"timesteps_split": timesteps_split_for_state,
|
@@ -160,8 +148,6 @@ def generate_video(segment, image, prompt, size, guidance_scale, num_inference_s
|
|
160 |
state_file = f"SkyReel_{segment}_{seed}.pt"
|
161 |
torch.save(state, state_file)
|
162 |
return None, seed
|
163 |
-
|
164 |
-
|
165 |
|
166 |
with gr.Blocks() as demo:
|
167 |
with gr.Row():
|
@@ -212,7 +198,6 @@ with gr.Blocks() as demo:
|
|
212 |
inputs=num_inference_steps,
|
213 |
outputs=range_sliders,
|
214 |
)
|
215 |
-
|
216 |
gr.Examples(
|
217 |
examples=examples,
|
218 |
inputs=prompt,
|
@@ -361,7 +346,6 @@ with gr.Blocks() as demo:
|
|
361 |
outputs=[result, seed],
|
362 |
)
|
363 |
|
364 |
-
|
365 |
if __name__ == "__main__":
|
366 |
init_predictor()
|
367 |
demo.launch()
|
|
|
30 |
os.putenv("TOKENIZERS_PARALLELISM","False")
|
31 |
|
32 |
def init_predictor():
|
33 |
+
global pipe
|
34 |
+
pipe = SkyReelsVideoSingleGpuInfer(
|
35 |
task_type= TaskType.I2V,
|
36 |
model_id="Skywork/SkyReels-V1-Hunyuan-I2V",
|
37 |
quant_model=False,
|
|
|
45 |
|
46 |
@spaces.GPU(duration=60)
|
47 |
def generate_video(segment, image, prompt, size, guidance_scale, num_inference_steps, frames, seed, progress=gr.Progress(track_tqdm=True) ):
|
|
|
48 |
random.seed(time.time())
|
49 |
seed = int(random.randrange(4294967294))
|
50 |
if segment==1:
|
|
|
|
|
51 |
prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
|
52 |
prompt=prompt, prompt_2=prompt, device=device
|
53 |
)
|
|
|
|
|
54 |
pipe.scheduler.set_timesteps(num_inference_steps, device=torch.device('cuda'))
|
55 |
timesteps = pipe.scheduler.timesteps
|
56 |
all_timesteps_cpu = timesteps.cpu()
|
57 |
timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
|
58 |
segment_timesteps = torch.from_numpy(timesteps_split_np[0]).to("cuda")
|
|
|
59 |
num_channels_latents = pipe.transformer.config.in_channels
|
60 |
+
num_channels_latents = int(num_channels_latents / 2)
|
61 |
+
image = pipe.video_processor.preprocess(image, height=height, width=width).to(
|
62 |
+
device, dtype=prompt_embeds.dtype
|
63 |
+
)
|
64 |
+
num_latent_frames = (frames - 1) // pipe.vae_scale_factor_temporal + 1
|
65 |
latents = pipe.prepare_latents(
|
66 |
batch_size=1, num_channels_latents=pipe.transformer.config.in_channels, height=height, width=width, num_frames=frames,
|
67 |
dtype=torch.float32, device=device, generator=generator, latents=None,
|
68 |
)
|
69 |
+
image_latents = pipe.image_latents(
|
70 |
+
image, batch_size, height, width, device, torch.float32, num_channels_latents, num_latent_frames
|
71 |
+
)
|
72 |
+
image_latents = image_latents.to(pipe.transformer.dtype)
|
73 |
guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
else:
|
75 |
state_file = f"rv_L_{segment-1}_{seed}.pt"
|
76 |
state = torch.load(state_file, weights_only=False)
|
77 |
generator = torch.Generator(device='cuda').manual_seed(seed)
|
78 |
+
latents = state["intermediate_latents"].to("cuda", dtype=torch.bfloat16)
|
79 |
+
guidance_scale = state["guidance_scale"]
|
80 |
+
all_timesteps_cpu = state["all_timesteps"]
|
81 |
+
height = state["height"]
|
82 |
+
width = state["width"]
|
83 |
+
pipe.scheduler.set_timesteps(len(all_timesteps_cpu), device=device)
|
84 |
+
timesteps_split_np = np.array_split(all_timesteps_cpu.numpy(), 8)
|
85 |
+
segment_timesteps = torch.from_numpy(timesteps_split_np[segment - 1]).to("cuda")
|
86 |
+
prompt_embeds = state["prompt_embeds"].to("cuda", dtype=torch.bfloat16)
|
87 |
+
pooled_prompt_embeds = state["pooled_prompt_embeds"].to("cuda", dtype=torch.bfloat16)
|
88 |
+
prompt_attention_mask = state["prompt_attention_mask"].to("cuda", dtype=torch.bfloat16)
|
89 |
+
image_latents = state["image_latents"].to("cuda", dtype=torch.bfloat16)
|
90 |
for i, t in enumerate(pipe.progress_bar(segment_timesteps)):
|
91 |
+
latents = latents.to(transformer_dtype)
|
92 |
+
latent_model_input = torch.cat([latents] * 2)
|
93 |
+
latent_image_input = (
|
94 |
+
torch.cat([image_latents] * 2)
|
95 |
+
)
|
96 |
+
latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=1)
|
97 |
timestep = t.expand(latents.shape[0]).to(latents.dtype)
|
98 |
with torch.no_grad():
|
99 |
noise_pred = self.transformer(
|
|
|
103 |
encoder_attention_mask=prompt_attention_mask,
|
104 |
pooled_projections=pooled_prompt_embeds,
|
105 |
guidance=guidance,
|
106 |
+
# attention_kwargs=attention_kwargs,
|
107 |
return_dict=False,
|
108 |
)[0]
|
109 |
|
110 |
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
111 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
112 |
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
113 |
+
intermediate_latents_cpu = latents.detach().cpu()
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
if segment==8:
|
115 |
latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
|
116 |
video = self.vae.decode(latents, return_dict=False)[0]
|
117 |
video = self.video_processor.postprocess_video(video, output_type=output_type)
|
118 |
+
# return HunyuanVideoPipelineOutput(frames=video)
|
|
|
|
|
119 |
save_dir = f"./"
|
120 |
video_out_file = f"{save_dir}/{seed}.mp4"
|
121 |
print(f"generate video, local path: {video_out_file}")
|
122 |
export_to_video(output, video_out_file, fps=24)
|
|
|
123 |
return video_out_file, seed
|
124 |
else:
|
125 |
original_prompt_embeds_cpu = prompt_embeds.cpu()
|
126 |
+
original_image_latents_cpu = image_latents.cpu()
|
127 |
original_pooled_prompt_embeds_cpu = pooled_prompt_embeds.cpu()
|
128 |
+
original_prompt_attention_mask_cpu = prompt_attention_mask.cpu()
|
129 |
original_add_time_ids_cpu = add_time_ids.cpu()
|
130 |
timesteps = pipe.scheduler.timesteps
|
131 |
all_timesteps_cpu = timesteps.cpu() # Move to CPU
|
|
|
133 |
"intermediate_latents": intermediate_latents_cpu,
|
134 |
"all_timesteps": all_timesteps_cpu, # Save full list generated by scheduler
|
135 |
"prompt_embeds": original_prompt_embeds_cpu, # Save ORIGINAL embeds
|
136 |
+
"image_latents": original_image_latents_cpu,
|
137 |
"pooled_prompt_embeds": original_pooled_prompt_embeds_cpu,
|
138 |
+
"prompt_attention_mask": original_prompt_attention_mask_cpu,
|
139 |
"add_time_ids": original_add_time_ids_cpu, # Save ORIGINAL time IDs
|
140 |
"guidance_scale": guidance_scale,
|
141 |
"timesteps_split": timesteps_split_for_state,
|
|
|
148 |
state_file = f"SkyReel_{segment}_{seed}.pt"
|
149 |
torch.save(state, state_file)
|
150 |
return None, seed
|
|
|
|
|
151 |
|
152 |
with gr.Blocks() as demo:
|
153 |
with gr.Row():
|
|
|
198 |
inputs=num_inference_steps,
|
199 |
outputs=range_sliders,
|
200 |
)
|
|
|
201 |
gr.Examples(
|
202 |
examples=examples,
|
203 |
inputs=prompt,
|
|
|
346 |
outputs=[result, seed],
|
347 |
)
|
348 |
|
|
|
349 |
if __name__ == "__main__":
|
350 |
init_predictor()
|
351 |
demo.launch()
|