|
--- |
|
license: mit |
|
--- |
|
|
|
|
|
``` |
|
git clone https://huggingface.co/ProgramerSalar/L1-S |
|
cd L1-S |
|
pip install -r requirements.txt |
|
``` |
|
- Installing time of requirements.txt file is approx=15 minute |
|
|
|
``` |
|
import os |
|
import json |
|
import torch |
|
import numpy as np |
|
import PIL |
|
from PIL import Image |
|
from IPython.display import HTML |
|
from pyramid_dit import PyramidDiTForVideoGeneration |
|
from IPython.display import Image as ipython_image |
|
from diffusers.utils import load_image, export_to_video, export_to_gif |
|
|
|
|
|
variant='diffusion_transformer_768p' # For high resolution |
|
# variant='diffusion_transformer_384p' # For low resolution |
|
|
|
model_path = "Path" # The downloaded checkpoint dir |
|
model_dtype = 'bf16' |
|
|
|
device_id = 0 |
|
torch.cuda.set_device(device_id) |
|
|
|
model = PyramidDiTForVideoGeneration( |
|
model_path, |
|
model_dtype, |
|
model_variant=variant, |
|
) |
|
|
|
model.vae.to("cuda") |
|
model.dit.to("cuda") |
|
model.text_encoder.to("cuda") |
|
|
|
model.vae.enable_tiling() |
|
|
|
if model_dtype == "bf16": |
|
torch_dtype = torch.bfloat16 |
|
elif model_dtype == "fp16": |
|
torch_dtype = torch.float16 |
|
else: |
|
torch_dtype = torch.float32 |
|
|
|
prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors" |
|
|
|
# used for 384p model variant |
|
# width = 640 |
|
# height = 384 |
|
|
|
# used for 768p model variant |
|
width = 1280 |
|
height = 768 |
|
|
|
temp = 16 # temp in [1, 31] <=> frame in [1, 241] <=> duration in [0, 10s] |
|
|
|
with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype): |
|
frames = model.generate( |
|
prompt=prompt, |
|
num_inference_steps=[20, 20, 20], |
|
video_num_inference_steps=[10, 10, 10], |
|
height=height, |
|
width=width, |
|
temp=temp, |
|
guidance_scale=9.0, # The guidance for the first frame, set it to 7 for 384p variant |
|
video_guidance_scale=5.0, # The guidance for the other video latent |
|
output_type="pil", |
|
save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed |
|
) |
|
|
|
export_to_video(frames, "./text_to_video_sample.mp4", fps=24) |
|
|
|
``` |
|
|
|
- Video Generating Time is 10 minute |