File size: 2,259 Bytes
a995c69 577822e a995c69 577822e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
---
license: mit
---
```
git clone https://huggingface.co/ProgramerSalar/L1-S
cd L1-S
pip install -r requirements.txt
```
- Installing time of requirements.txt file is approx=15 minute
```
import os
import json
import torch
import numpy as np
import PIL
from PIL import Image
from IPython.display import HTML
from pyramid_dit import PyramidDiTForVideoGeneration
from IPython.display import Image as ipython_image
from diffusers.utils import load_image, export_to_video, export_to_gif
variant='diffusion_transformer_768p' # For high resolution
# variant='diffusion_transformer_384p' # For low resolution
model_path = "Path" # The downloaded checkpoint dir
model_dtype = 'bf16'
device_id = 0
torch.cuda.set_device(device_id)
model = PyramidDiTForVideoGeneration(
model_path,
model_dtype,
model_variant=variant,
)
model.vae.to("cuda")
model.dit.to("cuda")
model.text_encoder.to("cuda")
model.vae.enable_tiling()
if model_dtype == "bf16":
torch_dtype = torch.bfloat16
elif model_dtype == "fp16":
torch_dtype = torch.float16
else:
torch_dtype = torch.float32
prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors"
# used for 384p model variant
# width = 640
# height = 384
# used for 768p model variant
width = 1280
height = 768
temp = 16 # temp in [1, 31] <=> frame in [1, 241] <=> duration in [0, 10s]
with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
frames = model.generate(
prompt=prompt,
num_inference_steps=[20, 20, 20],
video_num_inference_steps=[10, 10, 10],
height=height,
width=width,
temp=temp,
guidance_scale=9.0, # The guidance for the first frame, set it to 7 for 384p variant
video_guidance_scale=5.0, # The guidance for the other video latent
output_type="pil",
save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
)
export_to_video(frames, "./text_to_video_sample.mp4", fps=24)
```
- Video Generating Time is 10 minute |