stable-video-diffusion

Paused

File size: 5,037 Bytes

9ac31b8
 
3b06696
e6b3471
 
 
3b06696
 
 
9ac31b8
 
3b06696
e3d310b
 
9ac31b8
 
d56d267
9ac31b8
 
 
d56d267
 
0cd72ee
9ac31b8
25d3956
3b06696
492fffc
 
3b06696
9ac31b8
d56d267
 
3b06696
9ac31b8
 
25d3956
 
9ac31b8
e6b3471
 
9ac31b8
 
 
0cd72ee
efa319b
e6b3471
 
9ac31b8
 
e6b3471
 
0cd72ee
3b06696
0cd72ee
d56d267
 
 
3b06696
d56d267
 
 
 
 
 
e6b3471
d56d267
 
 
 
 
 
 
 
 
 
e6b3471
d56d267
 
 
 
 
3b06696
d56d267
 
 
ff46702
d56d267
e6b3471
 
 
 
3b06696
e6b3471
25d3956
 
0cd72ee
25d3956
 
e6b3471
c8d4706
 
3780d1e
492fffc
 
25d3956
d56d267
492fffc
9ac31b8
 
e6b3471
 
 
 
 
9ac31b8
 
 
 
 
 
 
d56d267
decf237
ba7dc43

import gradio as gr
import torch
import os
import uuid
import random

from glob import glob
from pathlib import Path
from typing import Optional
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image
from huggingface_hub import hf_hub_download

# Load the half-precision ("fp16" variant) weights of the img2vid-xt checkpoint
# once, at import time, so every request reuses the same pipeline object.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
# Replace the UNet with its torch.compile-wrapped counterpart.
pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="reduce-overhead")
# Largest signed 64-bit integer; used as the upper bound for seeds.
max_64_bit_int = (1 << 63) - 1

def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
):
    """Generate a 25-frame video from one image and write it to an mp4 file.

    Args:
        image: Conditioning image. Any mode other than RGB is converted to RGB
            before it is handed to the pipeline.
        seed: Seed for the torch generator. Ignored when ``randomize_seed`` is
            true; ``None`` is treated as a request for a random seed.
        randomize_seed: When true, draw a fresh seed in
            ``[0, max_64_bit_int]`` instead of using ``seed``.
        motion_bucket_id: Forwarded unchanged to the pipeline.
        fps_id: Frame rate passed to ``export_to_video``.
        version: Not read by this function; kept for interface compatibility.
        cond_aug: Not read by this function; the pipeline is always called
            with ``noise_aug_strength=0.1``.
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Not read by this function; the module-level pipeline is used.
        output_folder: Directory for the mp4 files; created if missing.

    Returns:
        A ``(video_path, seed)`` tuple: the path of the written mp4 and the
        seed that was actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Fail with a readable UI message instead of an AttributeError on None.
    if image is None:
        raise gr.Error("Please upload an image before generating a video.")

    # The original only handled RGBA; convert every non-RGB mode
    # (grayscale, palette, ...) to three channels.
    if image.mode != "RGB":
        image = image.convert("RGB")

    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    # Normalize so the seed handed back to the caller is always an int.
    seed = int(seed)
    generator = torch.manual_seed(seed)

    # Name the file after the number of existing mp4 videos in the folder.
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
    # If earlier files were removed, the count can point at a name that is
    # already taken; advance until the name is free rather than overwrite.
    while os.path.exists(video_path):
        base_count += 1
        video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=25,
    ).frames[0]

    # Export frames to video
    export_to_video(frames, video_path, fps=fps_id)
    # NOTE(review): re-seeds the global RNG after generation; kept from the
    # original, purpose not evident from this file.
    torch.manual_seed(seed)

    # Return the video and seed
    return video_path, seed

def resize_image(image, output_size=(1024, 576)):
    """Scale ``image`` so it covers ``output_size``, then center-crop to it.

    The image is resized (LANCZOS) with its aspect ratio preserved so that one
    dimension equals the target and the other is at least as large; the excess
    is then cropped equally from both sides of the longer dimension.

    Args:
        image: Object exposing ``width``, ``height``, ``resize`` and ``crop``
            (a PIL image in this app).
        output_size: ``(width, height)`` of the result.

    Returns:
        The cropped image produced by ``crop``.
    """
    target_w = output_size[0]
    target_h = output_size[1]
    wanted_ratio = target_w / target_h
    source_ratio = image.width / image.height

    if source_ratio > wanted_ratio:
        # Source is wider than the target: fit the height, trim left/right.
        scaled_h = target_h
        scaled_w = int(scaled_h * source_ratio)
        box = (
            (scaled_w - target_w) / 2,
            0,
            (scaled_w + target_w) / 2,
            target_h,
        )
    else:
        # Source is as narrow as or narrower than the target: fit the width,
        # trim top/bottom.
        scaled_w = target_w
        scaled_h = int(scaled_w / source_ratio)
        box = (
            0,
            (scaled_h - target_h) / 2,
            target_w,
            (scaled_h + target_h) / 2,
        )

    scaled = image.resize((scaled_w, scaled_h), Image.LANCZOS)
    return scaled.crop(box)

with gr.Blocks() as demo:
  # Page header with links to the model, paper and license.
  gr.Markdown('''# Stable Video Diffusion using Image 2 Video XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), 
      [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), 
      [stability's ui waitlist](https://stability.ai/contact))
      #### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
  ''')

  # One row: a column with the image input and the trigger button, followed
  # by the video output.
  with gr.Row():
    with gr.Column():
      image = gr.Image(label="Upload your image", type="pil")
      generate_btn = gr.Button("Generate")
    video = gr.Video()

  # Generation parameters, collapsed by default.
  with gr.Accordion("Advanced options", open=False):
    seed = gr.Slider(
      label="Seed",
      value=42,
      randomize=True,
      minimum=0,
      maximum=max_64_bit_int,
      step=1,
    )
    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
    motion_bucket_id = gr.Slider(
      label="Motion bucket id",
      info="Controls how much motion to add/remove from the image",
      value=127,
      minimum=1,
      maximum=255,
    )
    fps_id = gr.Slider(
      label="Frames per second",
      info="The length of your video in seconds will be 25/fps",
      value=6,
      minimum=5,
      maximum=30,
    )

  # Uploaded images are resized/cropped in place before generation.
  image.upload(
    fn=resize_image,
    inputs=image,
    outputs=image,
    queue=False,
  )
  # The button runs generation and writes the used seed back to the slider.
  generate_btn.click(
    fn=sample,
    inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
    outputs=[video, seed],
    api_name="video",
  )
  # Bundled example images; their outputs are cached.
  gr.Examples(
    examples=[
      "images/01.png",
      "images/02.png",
      "images/03.png",
      "images/04.png",
      "images/05.png",
    ],
    inputs=image,
    outputs=[video, seed],
    fn=sample,
    cache_examples=True,
  )

if __name__ == "__main__":
    # Enable the request queue (max_size=20), then launch with share=True.
    demo.queue(max_size=20).launch(share=True)