|
import spaces |
|
import gradio as gr |
|
import torch |
|
import torchvision as tv |
|
import random, os |
|
from diffusers import StableVideoDiffusionPipeline |
|
from PIL import Image |
|
from glob import glob |
|
from typing import Optional |
|
|
|
from tdd_svd_scheduler import TDDSVDStochasticIterativeScheduler |
|
from utils import load_lora_weights, save_video |
|
|
|
|
|
# Switch between checkpoints on the local filesystem (True) and
# checkpoints referenced by Hugging Face Hub identifiers (False).
LOCAL = False

if not LOCAL:
    # Hub repository of the base SVD model, plus the repository and file
    # name of the TDD LoRA weights.
    svd_path = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
    lora_repo_path = "RED-AIGC/TDD"
    lora_weight_name = "svd-xt-1-1_tdd_lora_weights.safetensors"
else:
    # Absolute paths to the base model directory and the LoRA weight file.
    svd_path = "/share2/duanyuxuan/diff_playground/diffusers_models/stable-video-diffusion-img2vid-xt-1-1"
    lora_file_path = "/share2/duanyuxuan/diff_playground/SVD-TDD/svd-xt-1-1_tdd_lora_weights.safetensors"
|
|
|
# The pipeline is only built when CUDA is reported as available; without it
# the names `noise_scheduler` and `pipeline` are never defined.
if torch.cuda.is_available():
    # Scheduler supplied by the project-local `tdd_svd_scheduler` module.
    noise_scheduler = TDDSVDStochasticIterativeScheduler(
        num_train_timesteps=250,
        sigma_min=0.002,
        sigma_max=700.0,
        sigma_data=1.0,
        s_noise=1.0,
        rho=7,
        clip_denoised=False,
    )

    # Load the fp16 variant of the base model with the custom scheduler,
    # then move the whole pipeline onto the GPU.
    pipeline = StableVideoDiffusionPipeline.from_pretrained(
        svd_path,
        scheduler=noise_scheduler,
        torch_dtype=torch.float16,
        variant="fp16",
    )
    pipeline = pipeline.to('cuda')

    # Attach the TDD LoRA weights to the UNet, from the Hub or from disk.
    if not LOCAL:
        load_lora_weights(pipeline.unet, lora_repo_path, weight_name=lora_weight_name)
    else:
        load_lora_weights(pipeline.unet, lora_file_path)

# Largest value of a signed 64-bit integer; upper bound for seeds.
max_64_bit_int = 2**63 - 1
|
|
|
@spaces.GPU
def sample(
    image: Image.Image,
    seed: Optional[int] = 1,
    randomize_seed: bool = False,
    num_inference_steps: int = 4,
    eta: float = 0.3,
    min_guidance_scale: float = 1.0,
    max_guidance_scale: float = 1.0,
    fps: int = 7,
    width: int = 512,
    height: int = 512,
    num_frames: int = 25,
    motion_bucket_id: int = 127,
    output_folder: str = "outputs_gradio",
):
    """Generate a video from a conditioning image and save it as an MP4.

    Args:
        image: Conditioning image handed to the pipeline.
        seed: Seed for the torch RNG. ``None`` is treated like
            ``randomize_seed=True``.
        randomize_seed: When true, ``seed`` is replaced by a random integer
            in ``[0, max_64_bit_int]``.
        num_inference_steps: Number of sampling steps passed to the pipeline.
        eta: Value forwarded to the scheduler through ``set_eta``.
        min_guidance_scale: Forwarded to the pipeline unchanged.
        max_guidance_scale: Forwarded to the pipeline unchanged.
        fps: Forwarded to the pipeline and used as the frame rate of the
            saved video.
        width: Output width forwarded to the pipeline.
        height: Output height forwarded to the pipeline.
        num_frames: Number of frames requested from the pipeline.
        motion_bucket_id: Forwarded to the pipeline unchanged.
        output_folder: Directory the video is written to; created if missing.

    Returns:
        A ``(video_path, seed)`` tuple: the path of the written MP4 file and
        the seed that was actually used.

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail with a readable message instead of an error from deep inside
    # the pipeline when "Generate" is clicked before uploading an image.
    if image is None:
        raise gr.Error("Please upload an image before generating a video.")

    pipeline.scheduler.set_eta(eta)

    # `seed` is declared Optional: a missing seed means "pick one for me".
    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    # Normalise to int so the returned seed has a single, predictable type
    # (the UI slider may deliver a float -- TODO confirm).
    seed = int(seed)
    generator = torch.manual_seed(seed)

    # Name the output after the number of MP4 files already in the folder,
    # zero-padded to six digits.
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    with torch.autocast("cuda"):
        frames = pipeline(
            image,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            min_guidance_scale=min_guidance_scale,
            max_guidance_scale=max_guidance_scale,
            num_frames=num_frames,
            fps=fps,
            motion_bucket_id=motion_bucket_id,
            decode_chunk_size=8,
            noise_aug_strength=0.02,
            generator=generator,
        ).frames[0]
    # NOTE(review): the flattened source does not show whether saving ran
    # inside the autocast block; it is kept outside here -- confirm.
    save_video(frames, video_path, fps=fps, quality=5.0)
    # Re-seed the global torch RNG with the same seed after generation.
    torch.manual_seed(seed)

    return video_path, seed
|
|
|
|
|
def preprocess_image(image, height = 512, width = 512):
    """Convert an image to RGB, centre-crop it to a square and resize it.

    Args:
        image: Image object providing ``convert``, ``size`` and ``resize``.
        height: Target height passed to ``resize``.
        width: Target width passed to ``resize``.

    Returns:
        The RGB image resized to ``(width, height)``.
    """
    rgb = image.convert('RGB')

    size_x, size_y = rgb.size[0], rgb.size[1]
    if size_x != size_y:
        # Non-square input: round-trip through a tensor so torchvision can
        # cut out the central square whose side is the shorter dimension.
        functional = tv.transforms.functional
        pixels = functional.pil_to_tensor(rgb)
        side = min(pixels.shape[-2:])
        pixels = functional.center_crop(pixels, side)
        rgb = functional.to_pil_image(pixels)

    return rgb.resize((width, height))
|
|
|
|
|
# Gradio UI: a header, an image input next to a video output, an options
# panel, and the event wiring that connects them to `preprocess_image`
# and `sample`.
# NOTE(review): the widget nesting below is reconstructed from a source
# whose indentation was lost -- confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Stable Video Diffusion distilled by ✨Target-Driven Distillation✨

Target-Driven Distillation (TDD) is a state-of-the-art consistency distillation model that largely accelerates the inference processes of diffusion models. Using its delicate strategies of *target timestep selection* and *decoupled guidance*, models distilled by TDD can generate highly detailed images with only a few steps.

Besides, TDD is also available for distilling video generation models. This space presents TDD-distilled [SVD-xt 1.1](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1).

[**Project Page**](https://redaigc.github.io/TDD/) **|** [**Paper**](https://arxiv.org/abs/2409.01347) **|** [**Code**](https://github.com/RedAIGC/Target-Driven-Distillation) **|** [**Model**](https://huggingface.co/RED-AIGC/TDD) **|** [🤗 **TDD-SDXL Demo**](https://huggingface.co/spaces/RED-AIGC/TDD) **|** [🤗 **TDD-SVD Demo**](https://huggingface.co/spaces/RED-AIGC/SVD-TDD)

The code of this space is built on [AnimateLCM-SVD](https://huggingface.co/spaces/wangfuyun/AnimateLCM-SVD) and we acknowledge their contribution.
"""
    )

    # Input image and trigger button on the left, resulting video beside them.
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()

    # Sampling options; the defaults mirror the defaults of `sample`.
    with gr.Accordion("Options", open=True):
        seed = gr.Slider(
            label="Seed",
            value=1,
            randomize=False,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
        min_guidance_scale = gr.Slider(
            label="Min guidance scale",
            info="min strength of classifier-free guidance",
            value=1.0,
            minimum=1.0,
            maximum=1.5,
        )
        max_guidance_scale = gr.Slider(
            label="Max guidance scale",
            info="max strength of classifier-free guidance, it should not be less than Min guidance scale",
            value=1.0,
            minimum=1.0,
            maximum=3.0,
        )
        num_inference_steps = gr.Slider(
            label="Num inference steps",
            info="steps for inference",
            value=4,
            minimum=4,
            maximum=8,
            step=1,
        )
        eta = gr.Slider(
            label="Eta",
            info="the value of gamma in gamma-sampling",
            value=0.3,
            minimum=0.0,
            maximum=1.0,
            step=0.1,
        )

    # Replace an uploaded image in place with its preprocessed version.
    image.upload(fn=preprocess_image, inputs=image, outputs=image, queue=False)

    # The input order must match the leading positional parameters of
    # `sample`; the seed actually used is written back to the seed slider.
    generate_btn.click(
        fn=sample,
        inputs=[
            image,
            seed,
            randomize_seed,
            num_inference_steps,
            eta,
            min_guidance_scale,
            max_guidance_scale,
        ],
        outputs=[video, seed],
        api_name="video",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the Gradio app with the request queue enabled.
if __name__ == "__main__":
    if not LOCAL:
        # Hosted mode: queue with `api_open=False`, launch with the API
        # page hidden.
        demo.queue(api_open=False).launch(show_api=False)
    else:
        # Local mode: bind to all interfaces and request a share link.
        demo.queue().launch(share=True, server_name='0.0.0.0')