|
---
|
|
license: apache-2.0
|
|
---
|
|
|
|
```python
|
|
import torch
|
|
from transformers import AutoTokenizer, UMT5EncoderModel
|
|
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
|
|
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
|
|
from diffusers.utils import export_to_video
|
|
from torchvision import transforms
|
|
import os
|
|
import cv2
|
|
import numpy as np
|
|
|
|
|
|
from pathlib import Path
|
|
import json
|
|
from safetensors.torch import safe_open
|
|
|
|
seed = 0

# Use the GPU when one is present; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the VAE through the AutoencoderKLWan class imported above. The previous
# `vae.from_pretrained(...)` referenced `vae` before it existed (NameError).
vae = AutoencoderKLWan.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
vae = vae.to(device)
|
|
|
|
# TODO: impl FlowDPMSolverMultistepScheduler
# Until then, use the UniPC multistep scheduler set up for flow prediction
# with flow sigmas.
scheduler = UniPCMultistepScheduler(
    num_train_timesteps=1000,
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    flow_shift=1.0,
)
|
|
|
|
# Tokenizer and UMT5 text encoder are both pulled from the same checkpoint;
# the encoder weights are loaded in bfloat16.
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
text_encoder = UMT5EncoderModel.from_pretrained(
    "google/umt5-xxl",
    torch_dtype=torch.bfloat16,
)
|
|
|
|
# Denoising transformer: the 14B text-to-video checkpoint, loaded in bfloat16.
# To try the 1.3B checkpoint, load 'StevenZhang/Wan2.1-T2V-1.3B-Diff' instead.
transformer = WanTransformer3DModel.from_pretrained(
    "StevenZhang/Wan2.1-T2V-14B-Diff",
    torch_dtype=torch.bfloat16,
)
|
|
|
|
# Assemble the pipeline from the individually loaded parts, then move it to
# the target device.
components = {
    "tokenizer": tokenizer,
    "text_encoder": text_encoder,
    "transformer": transformer,
    "vae": vae,
    "scheduler": scheduler,
}
pipe = WanPipeline(**components).to(device)
|
|
|
|
# Negative prompt (Chinese): a comma-separated list of traits to steer away
# from, e.g. overexposure, blurry details, subtitles, low quality, JPEG
# artifacts, deformed hands/faces/limbs, static frames, cluttered background.
negative_prompt = (
    "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,"
    "整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,"
    "画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,"
    "静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
)
|
|
|
|
# `flow_shift` is a scheduler option, not a WanPipeline call argument, so
# passing it alongside the prompt fails with an unexpected-keyword error.
# Rebuild the pipeline's scheduler from its own config with the intended
# shift instead.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=3.0)

# Fixed seed for reproducible sampling.
generator = torch.Generator(device=device).manual_seed(seed)

inputs = {
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,  # TODO
    "generator": generator,
    "num_inference_steps": 50,
    "guidance_scale": 5.0,
    "height": 480,
    "width": 832,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

# The pipeline output is batched; keep the first video of the batch.
video = pipe(**inputs).frames[0]
print(video.shape)

# Write the frames to disk as an MP4 at 16 frames per second.
export_to_video(video, "output.mp4", fps=16)
|
|
```
|
|
|