File size: 7,392 Bytes
7fe98ab
 
 
e1d5bb5
 
 
 
7fe98ab
d61a0bc
 
7fe98ab
ddd3c88
7fe98ab
 
 
 
 
d191aca
 
 
7fe98ab
 
ad80429
527a615
 
ad80429
7fe98ab
fad21f9
 
 
 
 
 
 
 
 
7fe98ab
 
 
 
64041b2
 
48fbb23
 
fad21f9
7fe98ab
ddd3c88
af73a4f
ddd3c88
d191aca
d1866f3
fad21f9
ddd3c88
64041b2
 
 
e1d5bb5
 
48fbb23
ddd3c88
e1d5bb5
 
ddd3c88
34999ab
48fbb23
64041b2
fad21f9
d1866f3
 
64041b2
fad21f9
 
 
d191aca
fad21f9
d1866f3
64041b2
01900db
9c5c2ad
 
 
 
 
 
 
 
 
 
 
 
d1866f3
9c5c2ad
 
 
 
ddd3c88
 
 
e1d5bb5
 
 
 
 
 
 
9c5c2ad
e1d5bb5
 
 
 
 
 
 
d1866f3
48fbb23
0093903
e1d5bb5
 
 
 
 
 
 
 
 
 
 
 
ddd3c88
 
e1d5bb5
bd83817
afc7727
ddd3c88
7fe98ab
 
2eea82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f55bc7
7fe98ab
 
fad21f9
2eea82e
 
 
fad21f9
48fbb23
fad21f9
48fbb23
fad21f9
48fbb23
af73a4f
2eea82e
48fbb23
2eea82e
 
 
 
7fe98ab
 
806b2b0
7fe98ab
 
 
 
d1866f3
7fe98ab
fad21f9
af73a4f
ad80429
 
64041b2
ddd3c88
fad21f9
 
 
 
1a236aa
 
 
64041b2
fad21f9
48fbb23
 
fad21f9
1a236aa
 
af73a4f
1a236aa
d1866f3
1a236aa
fad21f9
 
7fe98ab
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import random

import gradio as gr
import numpy as np
import spaces
import torch
# from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
# from diffusers import LTXLatentUpsamplePipeline
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_video


# Load the distilled LTX-Video pipeline and the latent spatial upsampler in
# bfloat16. The upsampler is given the base pipeline's VAE instance.
_WEIGHTS_DTYPE = torch.bfloat16
pipe = LTXConditionPipeline.from_pretrained(
    "linoyts/LTX-Video-0.9.7-distilled-diffusers",
    torch_dtype=_WEIGHTS_DTYPE,
)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers",
    vae=pipe.vae,
    torch_dtype=_WEIGHTS_DTYPE,
)

# Move both pipelines to the GPU, then turn on tiling for the VAE.
for _pipeline in (pipe, pipe_upsample):
    _pipeline.to("cuda")
pipe.vae.enable_tiling()

# Upper bound used when drawing a random seed.
MAX_SEED = np.iinfo(np.int32).max
# Largest height/width, in pixels.
MAX_IMAGE_SIZE = 2048


def round_to_nearest_resolution_acceptable_by_vae(height, width, ratio=None):
    """Snap a resolution down to multiples of the VAE spatial compression ratio.

    Despite the name, each dimension is rounded *down* (floored) to the
    closest multiple of ``ratio``. A dimension smaller than ``ratio`` is
    clamped up to ``ratio`` so the result is never zero.

    Args:
        height: Requested height in pixels.
        width: Requested width in pixels.
        ratio: Multiple to align to. Defaults to
            ``pipe.vae_spatial_compression_ratio`` of the module-level pipeline.

    Returns:
        A ``(height, width)`` tuple whose entries are positive multiples of
        ``ratio``.
    """
    if ratio is None:
        ratio = pipe.vae_spatial_compression_ratio
    print("before rounding",height, width)
    # Floor to a multiple of `ratio`, but never below one full multiple:
    # a zero-sized dimension cannot be generated.
    height = max(ratio, height - (height % ratio))
    width = max(ratio, width - (width % ratio))
    print("after rounding",height, width)
    return height, width

def change_mode_to_text():
    """Return a Gradio update that sets the target value to "text-to-video"."""
    selected_mode = "text-to-video"
    return gr.update(value=selected_mode)

def change_mode_to_image():
    """Return a Gradio update that sets the target value to "image-to-video"."""
    selected_mode = "image-to-video"
    return gr.update(value=selected_mode)

def change_mode_to_video():
    """Return a Gradio update that sets the target value to "video-to-video"."""
    selected_mode = "video-to-video"
    return gr.update(value=selected_mode)
    
@spaces.GPU
def generate(prompt,
             negative_prompt,
             image,
             video,
             height,
             width,
             mode,
             steps,
             num_frames,
             frames_to_use,
             seed,
             randomize_seed,
             guidance_scale,
             improve_texture=False, progress=gr.Progress(track_tqdm=True)):
    """Generate a video with the LTX pipeline and write it to ``output.mp4``.

    The video is first generated at roughly 2/3 of the requested resolution,
    then passed through the latent upsampler, optionally re-denoised for a
    few steps, and finally resized to the requested ``width`` x ``height``.

    Args:
        prompt: Text prompt.
        negative_prompt: Negative text prompt.
        image: Input image; used as conditioning only in "image-to-video" mode.
        video: Input video (as accepted by ``load_video``); used as
            conditioning only in "video-to-video" mode.
        height: Requested output height in pixels.
        width: Requested output width in pixels.
        mode: One of "text-to-video", "image-to-video", "video-to-video".
        steps: Number of inference steps for the first pass.
        num_frames: Number of frames to generate.
        frames_to_use: How many leading frames of ``video`` to condition on.
        seed: Seed for the random generators.
        randomize_seed: If true, ``seed`` is replaced by a random value.
        guidance_scale: Guidance scale forwarded to the pipeline.
        improve_texture: If true, run an extra short denoising pass on the
            upscaled latents.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        The path of the written video file, ``"output.mp4"``.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)  # torch.Generator.manual_seed requires an int

    # Part 1. Generate video at smaller resolution
    # Text-only conditioning is also supported without the need to pass `conditions`
    expected_height, expected_width = height, width
    downscale_factor = 2 / 3
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)

    print(mode)
    # Only build a condition when the active mode matches an available input.
    # Video conditioning belongs to "video-to-video"; gating it on
    # "text-to-video" made that mode unconditioned and let a stale upload
    # leak into pure text generation.
    condition1 = None
    if mode == "video-to-video" and video is not None:
        # The number input may deliver a float or None; a slice bound must be
        # an int, and at least one frame is needed to condition on.
        num_condition_frames = max(1, int(frames_to_use or 1))
        condition_frames = load_video(video)[:num_condition_frames]
        condition1 = LTXVideoCondition(video=condition_frames, frame_index=0)
    elif mode == "image-to-video" and image is not None:
        # A single image is passed as a one-frame video.
        condition1 = LTXVideoCondition(video=[image], frame_index=0)

    latents = pipe(
        conditions=condition1,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=steps,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        guidance_scale=guidance_scale,
        generator=torch.Generator(device="cuda").manual_seed(seed),
        output_type="latent",
    ).frames

    # Part 2. Upscale generated video using latent upsampler with fewer inference steps
    # The available latent upsampler upscales the height/width by 2x
    if improve_texture:
        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
        upscaled_latents = pipe_upsample(
            latents=latents,
            output_type="latent"
        ).frames

        # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
        frames = pipe(
            conditions=condition1,
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=upscaled_width,
            height=upscaled_height,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            denoise_strength=0.6,  # Effectively, 0.6 * 3 inference steps
            num_inference_steps=3,
            latents=upscaled_latents,
            decode_timestep=0.05,
            image_cond_noise_scale=0.025,
            generator=torch.Generator().manual_seed(seed),
            output_type="pil",
        ).frames[0]
    else:
        # No refinement pass: let the upsampler return the frames directly.
        frames = pipe_upsample(
            latents=latents,
        ).frames[0]

    # Part 4. Downscale the video to the expected resolution
    frames = [frame.resize((expected_width, expected_height)) for frame in frames]
    export_to_video(frames, "output.mp4", fps=24)
    return "output.mp4"



# Stylesheet handed to gr.Blocks(css=...): centers the element with id
# "col-container" and caps its width at 900px.
# NOTE(review): no component in the visible layout sets elem_id="col-container";
# confirm whether this rule is still needed.
css="""
#col-container {
    margin: 0 auto;
    max-width: 900px;
}
"""

# JavaScript snippet defining `refresh()`, which reloads the page with the
# `__theme=dark` query parameter when it is not already set.
# NOTE(review): `js_func` is not referenced anywhere in the visible code
# (it is not passed to gr.Blocks) — confirm whether it should be wired in or removed.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# UI layout: input tabs (text / image / video), prompt and run button on the
# left, generated video on the right, and an "Advanced settings" accordion.
with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:

    gr.Markdown("# LTX Video 0.9.7 Distilled")
    # Tracks which input tab is active; updated by the tab `select` handlers below.
    mode = gr.State(value="text-to-video")
    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Tab("text-to-video") as text_tab:
                    image_n = gr.Image(label="", visible=False)
                with gr.Tab("image-to-video") as image_tab:
                    image = gr.Image(label="input image")
                with gr.Tab("video-to-video") as video_tab:
                    video = gr.Video(label="input video")
                    # precision=0 delivers an int (the value is used as a slice
                    # bound); minimum=1 keeps at least one conditioning frame.
                    frames_to_use = gr.Number(
                        label="num frames to use",
                        info="first # of frames to use from the input video",
                        value=1,
                        precision=0,
                        minimum=1,
                    )
                prompt = gr.Textbox(label="prompt")
                improve_texture = gr.Checkbox(label="improve texture", value=False, info="slows down generation")
            run_button = gr.Button()
        with gr.Column():
            output = gr.Video(interactive=False)

    with gr.Accordion("Advanced settings", open=False):
        negative_prompt = gr.Textbox(
            label="negative prompt",
            value="worst quality, inconsistent motion, blurry, jittery, distorted",
            visible=False,
        )
        with gr.Row():
            seed = gr.Number(label="seed", value=0, precision=0)
            randomize_seed = gr.Checkbox(label="randomize seed")
        with gr.Row():
            guidance_scale = gr.Slider(label="guidance scale", minimum=0, maximum=10, value=3, step=1)
            steps = gr.Slider(label="Steps", minimum=1, maximum=30, value=8, step=1)
            num_frames = gr.Slider(label="# frames", minimum=1, maximum=161, value=96, step=1)
        with gr.Row():
            height = gr.Slider(label="height", value=512, step=1, maximum=MAX_IMAGE_SIZE)
            width = gr.Slider(label="width", value=704, step=1, maximum=MAX_IMAGE_SIZE)

    # Keep `mode` in sync with the selected tab.
    text_tab.select(fn=change_mode_to_text, inputs=[], outputs=[mode])
    image_tab.select(fn=change_mode_to_image, inputs=[], outputs=[mode])
    video_tab.select(fn=change_mode_to_video, inputs=[], outputs=[mode])

    # Input order must match the positional parameters of `generate`.
    run_button.click(
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            image,
            video,
            height,
            width,
            mode,
            steps,
            num_frames,
            frames_to_use,
            seed,
            randomize_seed,
            guidance_scale,
            improve_texture,
        ],
        outputs=[output],
    )


  

if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script,
    # so importing the module does not launch the app.
    demo.launch()