Fabrice-TIERCELIN committed on
Commit
562f1d2
1 Parent(s): 351335e

This PR makes the space work again

Browse files

Click on _Merge_ to add this feature.

It's based on a [working space](https://huggingface.co/spaces/xi0v/Stable-Video-Diffusion-Img2Vid).

Files changed (3) hide show
  1. README.md +3 -42
  2. app.py +246 -60
  3. requirements.txt +2 -1
README.md CHANGED
@@ -3,46 +3,7 @@ title: 📺RTV🖼️ - Real Time Video AI
3
  emoji: 🖼️📺
4
  colorFrom: purple
5
  colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 4.4.0
8
- app_file: app.py
9
- pinned: false
10
  license: other
11
- ---
12
-
13
- 1. Process Images in real time with prompts:
14
- 2. Example:
15
- - Elves Toy Factories North Pole Christmas Magic Elf Happy Magical Toy Robots Polar Bears Creatures Winter Streets with Holiday Festivals Christmas Present Lists Toys Candy Books Christmas Fun Facts
16
- Happy New Years
17
- 3. Add prompt here and download images: https://huggingface.co/spaces/awacke1/RealTimeImageGen
18
- 4. Load images to 01.png thru 09.png below
19
- 5. Rebuild
20
-
21
-
22
- One of the greatest new generative AI models. This model is being tested to generate Christmas themed videos including:
23
- 1. Reindeer
24
- 2. Sunset and Sunrise views of Christmas Eve and Christmas Day
25
- 3. Saint Nicholas
26
- 4. Elves and Toy Factories
27
- 5. Polar Bears, and Other Winter Creatures
28
- 6. Winter Streets with Holiday Festivals
29
- 7. Christmas Present Lists Toys and Candy
30
- 8. Books of Christmas with Fun Facts
31
- 9. Happy New Years!
32
-
33
- -In app.py this is implemented here and will cache the examples and process while loading creating 4 second videos for each image:
34
-
35
- ```
36
- gr.Examples(
37
- examples=[
38
- "images/01.png",
39
- "images/02.png",
40
- "images/03.png",
41
- "images/04.png",
42
- "images/05.png",
43
- "images/06.png",
44
- "images/07.png",
45
- "images/08.png",
46
- "images/09.png"
47
- ],
48
- ```
 
3
  emoji: 🖼️📺
4
  colorFrom: purple
5
  colorTo: yellow
6
+ short_description: Animate Your Pictures With Stable Video Diffusion
 
 
 
7
  license: other
8
+ sdk: gradio
9
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,114 +1,300 @@
1
  import gradio as gr
2
  import torch
3
  import os
4
- import uuid
5
  import random
 
 
 
6
  from glob import glob
7
  from pathlib import Path
8
- from typing import Optional
 
9
  from diffusers import StableVideoDiffusionPipeline
10
- from diffusers.utils import load_image, export_to_video
11
  from PIL import Image
12
- from huggingface_hub import hf_hub_download
13
 
 
 
 
 
14
 
15
- pipe = StableVideoDiffusionPipeline.from_pretrained(
16
- "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
17
  )
18
- pipe.to("cuda")
19
- pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
20
- max_64_bit_int = 2**63 - 1
 
 
 
21
 
 
22
 
23
- def sample(
24
  image: Image,
25
  seed: Optional[int] = 42,
26
  randomize_seed: bool = True,
27
  motion_bucket_id: int = 127,
28
- fps_id: int = 6,
29
- version: str = "svd_xt",
30
- cond_aug: float = 0.02,
31
- decoding_t: int = 3, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
32
- device: str = "cuda",
33
- output_folder: str = "outputs",
 
 
 
 
34
  ):
35
-
36
- if image.mode == "RGBA":
37
- image = image.convert("RGB")
38
- if(randomize_seed):
39
- seed = random.randint(0, max_64_bit_int)
 
 
 
 
 
 
 
40
 
41
- generator = torch.manual_seed(seed)
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  os.makedirs(output_folder, exist_ok=True)
44
- base_count = len(glob(os.path.join(output_folder, "*.mp4")))
45
- video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
46
- frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1, num_frames=25).frames[0]
47
- export_to_video(frames, video_path, fps=fps_id)
48
- torch.manual_seed(seed)
49
- return video_path, seed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  def resize_image(image, output_size=(1024, 576)):
 
 
 
 
 
52
  target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size
53
  image_aspect = image.width / image.height # Aspect ratio of the original image
54
 
 
55
  if image_aspect > target_aspect:
 
56
  new_height = output_size[1]
57
  new_width = int(new_height * image_aspect)
58
  resized_image = image.resize((new_width, new_height), Image.LANCZOS)
 
59
  left = (new_width - output_size[0]) / 2
60
  top = 0
61
  right = (new_width + output_size[0]) / 2
62
  bottom = output_size[1]
63
  else:
 
64
  new_width = output_size[0]
65
  new_height = int(new_width / image_aspect)
66
  resized_image = image.resize((new_width, new_height), Image.LANCZOS)
 
67
  left = 0
68
  top = (new_height - output_size[1]) / 2
69
  right = output_size[0]
70
  bottom = (new_height + output_size[1]) / 2
71
 
72
- cropped_image = resized_image.crop((left, top, right, bottom))
73
- return cropped_image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  with gr.Blocks() as demo:
 
 
 
 
76
 
77
- gr.Markdown('''# Stable Video Diffusion using Image 2 Video XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt),
78
- [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets),
79
- [stability's ui waitlist](https://stability.ai/contact))
80
- #### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
81
- ''')
82
-
 
83
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- with gr.Column():
86
- image = gr.Image(label="Upload your image", type="pil")
87
- generate_btn = gr.Button("Generate")
88
-
89
- video = gr.Video()
90
-
91
- with gr.Accordion("Advanced options", open=False):
92
-
93
- seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
94
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
95
- motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
96
- fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=6, minimum=5, maximum=30)
97
-
98
- image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
99
- generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id], outputs=[video, seed], api_name="video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  gr.Examples(
101
  examples=[
102
- "images/01.png",
103
- "images/02.png",
104
- "images/03.png",
105
  ],
106
- inputs=image,
107
- outputs=[video, seed],
108
- fn=sample,
109
- cache_examples=True,
 
110
  )
111
 
112
  if __name__ == "__main__":
113
- demo.queue(max_size=20)
114
- demo.launch(share=True)
 
1
  import gradio as gr
2
  import torch
3
  import os
 
4
  import random
5
+ import time
6
+ import math
7
+ import spaces
8
  from glob import glob
9
  from pathlib import Path
10
+ from typing import Optional, List, Union
11
+
12
  from diffusers import StableVideoDiffusionPipeline
13
+ from diffusers.utils import export_to_video, export_to_gif
14
  from PIL import Image
 
15
 
16
+ fps25Pipe = StableVideoDiffusionPipeline.from_pretrained(
17
+ "vdo/stable-video-diffusion-img2vid-xt-1-1", torch_dtype=torch.float16, variant="fp16"
18
+ )
19
+ fps25Pipe.to("cuda")
20
 
21
+ fps14Pipe = StableVideoDiffusionPipeline.from_pretrained(
22
+ "stabilityai/stable-video-diffusion-img2vid", torch_dtype=torch.float16, variant="fp16"
23
  )
24
+ fps14Pipe.to("cuda")
25
+
26
+ dragnuwaPipe = StableVideoDiffusionPipeline.from_pretrained(
27
+ "a-r-r-o-w/dragnuwa-svd", torch_dtype=torch.float16, variant="fp16", low_cpu_mem_usage=False, device_map=None
28
+ )
29
+ dragnuwaPipe.to("cuda")
30
 
31
+ max_64_bit_int = 2**63 - 1
32
 
33
+ def animate(
34
  image: Image,
35
  seed: Optional[int] = 42,
36
  randomize_seed: bool = True,
37
  motion_bucket_id: int = 127,
38
+ fps_id: int = 25,
39
+ noise_aug_strength: float = 0.1,
40
+ decoding_t: int = 3,
41
+ video_format: str = "mp4",
42
+ frame_format: str = "webp",
43
+ version: str = "auto",
44
+ width: int = 1024,
45
+ height: int = 576,
46
+ motion_control: bool = False,
47
+ num_inference_steps: int = 25
48
  ):
49
+ start = time.time()
50
+
51
+ if image is None:
52
+ raise gr.Error("Please provide an image to animate.")
53
+
54
+ output_folder = "outputs"
55
+ image_data = resize_image(image, output_size=(width, height))
56
+ if image_data.mode == "RGBA":
57
+ image_data = image_data.convert("RGB")
58
+
59
+ if motion_control:
60
+ image_data = [image_data] * 2
61
 
62
+ if randomize_seed:
63
+ seed = random.randint(0, max_64_bit_int)
64
+
65
+ if version == "auto":
66
+ if 14 < fps_id:
67
+ version = "svdxt"
68
+ else:
69
+ version = "svd"
70
 
71
+ frames = animate_on_gpu(
72
+ image_data,
73
+ seed,
74
+ motion_bucket_id,
75
+ fps_id,
76
+ noise_aug_strength,
77
+ decoding_t,
78
+ version,
79
+ width,
80
+ height,
81
+ num_inference_steps
82
+ )
83
+
84
  os.makedirs(output_folder, exist_ok=True)
85
+ base_count = len(glob(os.path.join(output_folder, "*." + video_format)))
86
+ result_path = os.path.join(output_folder, f"{base_count:06d}." + video_format)
87
+
88
+ if video_format == "gif":
89
+ video_path = None
90
+ gif_path = result_path
91
+ export_to_gif(image=frames, output_gif_path=gif_path, fps=fps_id)
92
+ else:
93
+ video_path = result_path
94
+ gif_path = None
95
+ export_to_video(frames, video_path, fps=fps_id)
96
+
97
+ end = time.time()
98
+ secondes = int(end - start)
99
+ minutes = math.floor(secondes / 60)
100
+ secondes = secondes - (minutes * 60)
101
+ hours = math.floor(minutes / 60)
102
+ minutes = minutes - (hours * 60)
103
+ information = ("Start the process again if you want a different result. " if randomize_seed else "") + \
104
+ "Wait 2 min before a new run to avoid quota penalty or use another computer. " + \
105
+ "The video has been generated in " + \
106
+ ((str(hours) + " h, ") if hours != 0 else "") + \
107
+ ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
108
+ str(secondes) + " sec."
109
+
110
+ return [
111
+ # Display for video
112
+ gr.update(value = video_path, visible = video_format != "gif"),
113
+ # Display for gif
114
+ gr.update(value = gif_path, visible = video_format == "gif"),
115
+ # Download button
116
+ gr.update(label = "💾 Download animation in *." + video_format + " format", value=result_path, visible=True),
117
+ # Frames
118
+ gr.update(label = "Generated frames in *." + frame_format + " format", format = frame_format, value = frames, visible = True),
119
+ # Used seed
120
+ seed,
121
+ # Information
122
+ gr.update(value = information, visible = True),
123
+ # Reset button
124
+ gr.update(visible = True)
125
+ ]
126
+
127
+ @torch.no_grad()
128
+ @spaces.GPU(duration=180)
129
+ def animate_on_gpu(
130
+ image_data: Union[Image.Image, List[Image.Image]],
131
+ seed: Optional[int] = 42,
132
+ motion_bucket_id: int = 127,
133
+ fps_id: int = 6,
134
+ noise_aug_strength: float = 0.1,
135
+ decoding_t: int = 3,
136
+ version: str = "svdxt",
137
+ width: int = 1024,
138
+ height: int = 576,
139
+ num_inference_steps: int = 25
140
+ ):
141
+ generator = torch.manual_seed(seed)
142
+
143
+ if version == "dragnuwa":
144
+ return dragnuwaPipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
145
+ elif version == "svdxt":
146
+ return fps25Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
147
+ else:
148
+ return fps14Pipe(image_data, width=width, height=height, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=noise_aug_strength, num_frames=25, num_inference_steps=num_inference_steps).frames[0]
149
+
150
 
151
  def resize_image(image, output_size=(1024, 576)):
152
+ # Do not touch the image if the size is good
153
+ if image.width == output_size[0] and image.height == output_size[1]:
154
+ return image
155
+
156
+ # Calculate aspect ratios
157
  target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size
158
  image_aspect = image.width / image.height # Aspect ratio of the original image
159
 
160
+ # Resize if the original image is larger
161
  if image_aspect > target_aspect:
162
+ # Resize the image to match the target height, maintaining aspect ratio
163
  new_height = output_size[1]
164
  new_width = int(new_height * image_aspect)
165
  resized_image = image.resize((new_width, new_height), Image.LANCZOS)
166
+ # Calculate coordinates for cropping
167
  left = (new_width - output_size[0]) / 2
168
  top = 0
169
  right = (new_width + output_size[0]) / 2
170
  bottom = output_size[1]
171
  else:
172
+ # Resize the image to match the target width, maintaining aspect ratio
173
  new_width = output_size[0]
174
  new_height = int(new_width / image_aspect)
175
  resized_image = image.resize((new_width, new_height), Image.LANCZOS)
176
+ # Calculate coordinates for cropping
177
  left = 0
178
  top = (new_height - output_size[1]) / 2
179
  right = output_size[0]
180
  bottom = (new_height + output_size[1]) / 2
181
 
182
+ # Crop the image
183
+ return resized_image.crop((left, top, right, bottom))
184
+
185
+ def reset():
186
+ return [
187
+ None,
188
+ random.randint(0, max_64_bit_int),
189
+ True,
190
+ 127,
191
+ 6,
192
+ 0.1,
193
+ 3,
194
+ "mp4",
195
+ "webp",
196
+ "auto",
197
+ 1024,
198
+ 576,
199
+ False,
200
+ 25
201
+ ]
202
 
203
  with gr.Blocks() as demo:
204
+ gr.HTML("""
205
+ <h1><center>Image-to-Video</center></h1>
206
+ <big><center>Animate your image into 25 frames of 1024x576 pixels freely, without account, without watermark and download the video</center></big>
207
+ <br/>
208
 
209
+ <p>
210
+ This demo is based on <i>Stable Video Diffusion</i> artificial intelligence.
211
+ No prompt or camera control is handled here.
212
+ To control motions, rather use <i><a href="https://huggingface.co/spaces/TencentARC/MotionCtrl_SVD">MotionCtrl SVD</a></i>.
213
+ If you need 128 frames, rather use <i><a href="https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1">ExVideo</a></i>.
214
+ </p>
215
+ """)
216
  with gr.Row():
217
+ with gr.Column():
218
+ image = gr.Image(label="Upload your image", type="pil")
219
+ with gr.Accordion("Advanced options", open=False):
220
+ width = gr.Slider(label="Width", info="Width of the video", value=1024, minimum=256, maximum=1024, step=8)
221
+ height = gr.Slider(label="Height", info="Height of the video", value=576, minimum=256, maximum=576, step=8)
222
+ motion_control = gr.Checkbox(label="Motion control (experimental)", info="Fix the camera", value=False)
223
+ video_format = gr.Radio([["*.mp4", "mp4"], ["*.avi", "avi"], ["*.wmv", "wmv"], ["*.mkv", "mkv"], ["*.mov", "mov"], ["*.gif", "gif"]], label="Video format for result", info="File extention", value="mp4", interactive=True)
224
+ frame_format = gr.Radio([["*.webp", "webp"], ["*.png", "png"], ["*.jpeg", "jpeg"], ["*.gif (unanimated)", "gif"], ["*.bmp", "bmp"]], label="Image format for frames", info="File extention", value="webp", interactive=True)
225
+ fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=25, minimum=5, maximum=30)
226
+ motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
227
+ noise_aug_strength = gr.Slider(label="Noise strength", info="The noise to add", value=0.1, minimum=0, maximum=1, step=0.1)
228
+ num_inference_steps = gr.Slider(label="Number inference steps", info="More denoising steps usually lead to a higher quality video at the expense of slower inference", value=25, minimum=1, maximum=100, step=1)
229
+ decoding_t = gr.Slider(label="Decoding", info="Number of frames decoded at a time; this eats more VRAM; reduce if necessary", value=3, minimum=1, maximum=5, step=1)
230
+ version = gr.Radio([["Auto", "auto"], ["🏃🏻‍♀️ SVD (trained on 14 f/s)", "svd"], ["🏃🏻‍♀️💨 SVD-XT (trained on 25 f/s)", "svdxt"]], label="Model", info="Trained model", value="auto", interactive=True)
231
+ seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
232
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
233
+
234
+ generate_btn = gr.Button(value="🚀 Animate", variant="primary")
235
+ reset_btn = gr.Button(value="🧹 Reinit page", variant="stop", elem_id="reset_button", visible = False)
236
+
237
+ with gr.Column():
238
+ video_output = gr.Video(label="Generated video", format="mp4", autoplay=True, show_download_button=False)
239
+ gif_output = gr.Image(label="Generated video", format="gif", show_download_button=False, visible=False)
240
+ download_button = gr.DownloadButton(label="💾 Download video", visible=False)
241
+ information_msg = gr.HTML(visible=False)
242
+ gallery = gr.Gallery(label="Generated frames", visible=False)
243
 
244
+ generate_btn.click(fn=animate, inputs=[
245
+ image,
246
+ seed,
247
+ randomize_seed,
248
+ motion_bucket_id,
249
+ fps_id,
250
+ noise_aug_strength,
251
+ decoding_t,
252
+ video_format,
253
+ frame_format,
254
+ version,
255
+ width,
256
+ height,
257
+ motion_control,
258
+ num_inference_steps
259
+ ], outputs=[
260
+ video_output,
261
+ gif_output,
262
+ download_button,
263
+ gallery,
264
+ seed,
265
+ information_msg,
266
+ reset_btn
267
+ ], api_name="video")
268
+
269
+ reset_btn.click(fn = reset, inputs = [], outputs = [
270
+ image,
271
+ seed,
272
+ randomize_seed,
273
+ motion_bucket_id,
274
+ fps_id,
275
+ noise_aug_strength,
276
+ decoding_t,
277
+ video_format,
278
+ frame_format,
279
+ version,
280
+ width,
281
+ height,
282
+ motion_control,
283
+ num_inference_steps
284
+ ], queue = False, show_progress = False)
285
+
286
  gr.Examples(
287
  examples=[
288
+ ["images/01.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
289
+ ["images/02.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
290
+ ["images/03.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25]
291
  ],
292
+ inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id, noise_aug_strength, decoding_t, video_format, frame_format, version, width, height, motion_control, num_inference_steps],
293
+ outputs=[video_output, gif_output, download_button, gallery, seed, information_msg, reset_btn],
294
+ fn=animate,
295
+ run_on_click=True,
296
+ cache_examples=False,
297
  )
298
 
299
  if __name__ == "__main__":
300
+ demo.launch(share=True, show_api=False)
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ transformers
4
  accelerate
5
  safetensors
6
  opencv-python
7
- uuid
 
 
4
  accelerate
5
  safetensors
6
  opencv-python
7
+ uuid
8
+ torch