Kvikontent Fabrice-TIERCELIN committed on
Commit
e0f9e9c
•
1 Parent(s): bef1271

This PR adds 12 features (#12)

Browse files

- This PR adds 12 features (b312696aba2507c6ac758a35c1df27b01fe9bbc9)


Co-authored-by: Fabrice TIERCELIN <[email protected]>

Files changed (2) hide show
  1. app.py +235 -42
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,63 +1,163 @@
1
  import gradio as gr
2
- #import gradio.helpers
3
  import torch
4
  import os
 
 
 
 
5
  from glob import glob
6
  from pathlib import Path
7
- from typing import Optional
8
 
9
- from diffusers import StableVideoDiffusionPipeline
10
- from diffusers.utils import load_image, export_to_video
11
  from PIL import Image
12
 
13
- import uuid
14
- import random
15
- from huggingface_hub import hf_hub_download
16
- import spaces
17
-
18
- pipe = StableVideoDiffusionPipeline.from_pretrained(
19
  "vdo/stable-video-diffusion-img2vid-xt-1-1", torch_dtype=torch.float16, variant="fp16"
20
  )
21
- pipe.to("cuda")
 
 
 
 
 
 
 
 
 
 
22
 
23
  max_64_bit_int = 2**63 - 1
24
 
25
- @spaces.GPU(duration=120)
26
- def sample(
27
  image: Image,
28
  seed: Optional[int] = 42,
29
  randomize_seed: bool = True,
30
  motion_bucket_id: int = 127,
31
- fps_id: int = 6,
32
- version: str = "svd_xt",
33
- cond_aug: float = 0.02,
34
- decoding_t: int = 3, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
35
- device: str = "cuda",
36
- output_folder: str = "outputs",
 
 
 
 
37
  ):
38
- if image.mode == "RGBA":
39
- image = image.convert("RGB")
 
 
 
 
 
 
 
 
 
 
40
 
41
- if(randomize_seed):
42
  seed = random.randint(0, max_64_bit_int)
43
- generator = torch.manual_seed(seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  os.makedirs(output_folder, exist_ok=True)
46
- base_count = len(glob(os.path.join(output_folder, "*.mp4")))
47
- video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
48
 
49
- frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1, num_frames=25).frames[0]
50
- export_to_video(frames, video_path, fps=fps_id)
51
- torch.manual_seed(seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- return video_path, frames, seed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def resize_image(image, output_size=(1024, 576)):
 
 
 
 
56
  # Calculate aspect ratios
57
  target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size
58
  image_aspect = image.width / image.height # Aspect ratio of the original image
59
 
60
- # Resize then crop if the original image is larger
61
  if image_aspect > target_aspect:
62
  # Resize the image to match the target height, maintaining aspect ratio
63
  new_height = output_size[1]
@@ -80,28 +180,121 @@ def resize_image(image, output_size=(1024, 576)):
80
  bottom = (new_height + output_size[1]) / 2
81
 
82
  # Crop the image
83
- cropped_image = resized_image.crop((left, top, right, bottom))
84
- return cropped_image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  with gr.Blocks() as demo:
87
- gr.Markdown('''# Community demo for Stable Video Diffusion - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact))
88
- #### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
89
- ''')
 
 
 
 
 
 
 
 
 
90
  with gr.Row():
91
  with gr.Column():
92
  image = gr.Image(label="Upload your image", type="pil")
93
  with gr.Accordion("Advanced options", open=False):
 
 
 
 
 
 
 
 
 
 
 
94
  seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
95
  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
96
- motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
97
- fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=6, minimum=5, maximum=30)
98
- generate_btn = gr.Button(value="Animate", variant="primary")
 
99
  with gr.Column():
100
- video = gr.Video(label="Generated video")
101
- gallery = gr.Gallery(label="Generated frames")
 
 
 
102
 
103
- image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
104
- generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id], outputs=[video, gallery, seed], api_name="video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  if __name__ == "__main__":
107
  demo.launch(share=True, show_api=False)
 
1
  import gradio as gr
 
2
  import torch
3
  import os
4
+ import random
5
+ import time
6
+ import math
7
+ import spaces
8
  from glob import glob
9
  from pathlib import Path
10
+ from typing import Optional, List, Union
11
 
12
+ from diffusers import StableVideoDiffusionPipeline, StableVideoDragNUWAPipeline
13
+ from diffusers.utils import export_to_video, export_to_gif
14
  from PIL import Image
15
 
16
# Keyword arguments shared by every checkpoint below: half-precision weights
# loaded from the "fp16" variant of each repository.
_HALF_PRECISION = {"torch_dtype": torch.float16, "variant": "fp16"}

# SVD-XT checkpoint; animate_on_gpu() uses it when version == "svdxt"
# (which is what "auto" resolves to above 14 frames per second).
fps25Pipe = StableVideoDiffusionPipeline.from_pretrained(
    "vdo/stable-video-diffusion-img2vid-xt-1-1",
    **_HALF_PRECISION,
)
fps25Pipe.to("cuda")

# Base SVD checkpoint; animate_on_gpu() falls back to it for any version
# other than "svdxt" or "dragnuwa".
fps14Pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid",
    **_HALF_PRECISION,
)
fps14Pipe.to("cuda")

# DragNUWA checkpoint; animate_on_gpu() uses it when version == "dragnuwa".
dragnuwaPipe = StableVideoDragNUWAPipeline.from_pretrained(
    "a-r-r-o-w/dragnuwa-svd",
    low_cpu_mem_usage=False,
    device_map=None,
    **_HALF_PRECISION,
)
dragnuwaPipe.to("cuda")

# Upper bound for the seed slider and for randomly drawn seeds.
max_64_bit_int = 2**63 - 1
32
 
33
def _format_elapsed(total_seconds: int) -> str:
    """Render whole seconds as "[H h, ][M min, ]S sec.", omitting leading zero units."""
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)

    parts = []
    if hours != 0:
        parts.append(f"{hours} h, ")
    if hours != 0 or minutes != 0:
        parts.append(f"{minutes} min, ")
    parts.append(f"{seconds} sec.")
    return "".join(parts)


def animate(
    image: Optional[Image.Image],
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 25,
    noise_aug_strength: float = 0.1,
    decoding_t: int = 3,
    video_format: str = "mp4",
    frame_format: str = "webp",
    version: str = "auto",
    width: int = 1024,
    height: int = 576,
    motion_control: bool = False,
    num_inference_steps: int = 25
):
    """Animate one image and return the updates for the result widgets.

    The image is resized/cropped to ``width`` x ``height``, converted to RGB,
    handed to ``animate_on_gpu`` and the returned frames are written to
    ``outputs/NNNNNN.<video_format>``.

    Args:
        image: Picture to animate; ``None`` is rejected.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: Draw a fresh seed instead of using ``seed``.
        motion_bucket_id: Forwarded to the pipeline.
        fps_id: Frame rate of the exported file; also drives the "auto"
            model choice.
        noise_aug_strength: Forwarded to the pipeline.
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        video_format: Extension of the exported file; "gif" goes through
            ``export_to_gif``, anything else through ``export_to_video``.
        frame_format: Image format requested for the frame gallery.
        version: "auto", "svd", "svdxt" or "dragnuwa".
        width: Target width in pixels.
        height: Target height in pixels.
        motion_control: When true, the image is passed to the pipeline as a
            list of three copies.
        num_inference_steps: Forwarded to the pipeline.

    Returns:
        A list of seven entries, in the order wired in the UI: video update,
        gif update, download-button update, gallery update, the seed actually
        used, information-message update, reset-button update.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    start = time.time()

    if image is None:
        raise gr.Error("Please provide an image to animate.")

    output_folder = "outputs"
    image_data = resize_image(image, output_size=(width, height))
    # Normalise every non-RGB mode (RGBA, palette, grayscale, ...), not only
    # RGBA; presumably the pipelines expect three-channel input.
    if image_data.mode != "RGB":
        image_data = image_data.convert("RGB")

    if motion_control:
        # NOTE(review): confirm how each pipeline interprets a list of three
        # identical images; only the first result (.frames[0]) is kept.
        image_data = [image_data] * 3

    # A missing seed cannot be fed to torch.manual_seed, so draw one as well.
    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)

    if version == "auto":
        # Above 14 frames per second use the SVD-XT checkpoint, else base SVD.
        version = "svdxt" if 14 < fps_id else "svd"

    frames = animate_on_gpu(
        image_data,
        seed,
        motion_bucket_id,
        fps_id,
        noise_aug_strength,
        decoding_t,
        version,
        width,
        height,
        num_inference_steps
    )

    # Name the file after the number of results already stored for this format.
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*." + video_format)))
    result_path = os.path.join(output_folder, f"{base_count:06d}." + video_format)

    is_gif = video_format == "gif"
    if is_gif:
        video_path = None
        gif_path = result_path
        export_to_gif(image=frames, output_gif_path=gif_path, fps=fps_id)
    else:
        video_path = result_path
        gif_path = None
        export_to_video(frames, video_path, fps=fps_id)

    information = (
        ("Start the process again if you want a different result. " if randomize_seed else "")
        + "Wait 2 min before a new run to avoid quota penalty or use another computer. "
        + "The video has been generated in "
        + _format_elapsed(int(time.time() - start))
    )

    return [
        # Display for video
        gr.update(value=video_path, visible=not is_gif),
        # Display for gif
        gr.update(value=gif_path, visible=is_gif),
        # Download button
        gr.update(label="💾 Download animation in *." + video_format + " format", value=result_path, visible=True),
        # Frames
        gr.update(label="Generated frames in *." + frame_format + " format", format=frame_format, value=frames, visible=True),
        # Used seed
        seed,
        # Information
        gr.update(value=information, visible=True),
        # Reset button
        gr.update(visible=True)
    ]
126
+
127
@torch.no_grad()
@spaces.GPU(duration=180)
def animate_on_gpu(
    image_data: Union[Image.Image, List[Image.Image]],
    seed: Optional[int] = 42,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    noise_aug_strength: float = 0.1,
    decoding_t: int = 3,
    version: str = "svdxt",
    width: int = 1024,
    height: int = 576,
    num_inference_steps: int = 25
):
    """Run the selected pipeline and return the frames of its first result.

    ``version`` picks the pipeline: "dragnuwa" -> ``dragnuwaPipe``,
    "svdxt" -> ``fps25Pipe``, anything else -> ``fps14Pipe``. Every pipeline
    is asked for 25 frames. ``fps_id`` is accepted but not used here.

    NOTE(review): the ``spaces.GPU(duration=180)`` decorator presumably
    reserves a GPU slot for up to 180 seconds - confirm against the
    ``spaces`` package.
    """
    rng = torch.manual_seed(seed)

    # Choose the model first so the (identical) call is written only once.
    if version == "dragnuwa":
        pipeline = dragnuwaPipe
    elif version == "svdxt":
        pipeline = fps25Pipe
    else:
        pipeline = fps14Pipe

    result = pipeline(
        image_data,
        width=width,
        height=height,
        decode_chunk_size=decoding_t,
        generator=rng,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=noise_aug_strength,
        num_frames=25,
        num_inference_steps=num_inference_steps,
    )
    return result.frames[0]
149
+
150
 
151
  def resize_image(image, output_size=(1024, 576)):
152
+ # Do not touch the image if the size is good
153
+ if image.width == output_size[0] and image.height == output_size[1]:
154
+ return image
155
+
156
  # Calculate aspect ratios
157
  target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size
158
  image_aspect = image.width / image.height # Aspect ratio of the original image
159
 
160
+ # Resize if the original image is larger
161
  if image_aspect > target_aspect:
162
  # Resize the image to match the target height, maintaining aspect ratio
163
  new_height = output_size[1]
 
180
  bottom = (new_height + output_size[1]) / 2
181
 
182
  # Crop the image
183
+ return resized_image.crop((left, top, right, bottom))
184
+
185
def reset():
    """Return the initial value of every input widget.

    The list order matches the ``outputs`` wired to the reset button:
    image, seed, randomize_seed, motion_bucket_id, fps_id,
    noise_aug_strength, decoding_t, video_format, frame_format, version,
    width, height, motion_control, num_inference_steps.
    """
    return [
        None,                               # image: cleared
        random.randint(0, max_64_bit_int),  # seed: fresh random value
        True,                               # randomize_seed
        127,                                # motion_bucket_id
        25,                                 # fps_id: slider's initial value (was 6)
        0.1,                                # noise_aug_strength
        3,                                  # decoding_t
        "mp4",                              # video_format
        "webp",                             # frame_format
        "auto",                             # version
        1024,                               # width
        576,                                # height
        False,                              # motion_control
        25                                  # num_inference_steps
    ]
202
 
203
# NOTE(review): the widget nesting below was reconstructed from a view that had
# lost its indentation - confirm the Row / Column / Accordion membership.
with gr.Blocks() as demo:
    gr.HTML("""
        <h1><center>Image-to-Video</center></h1>
        <big><center>Animate your image into 25 frames of 1024x576 pixels freely, without account, without watermark and download the video</center></big>
        <br/>

        <p>
        This demo is based on <i>Stable Video Diffusion</i> artificial intelligence.
        No prompt or camera control is handled here.
        To control motions, rather use <i><a href="https://huggingface.co/spaces/TencentARC/MotionCtrl_SVD">MotionCtrl SVD</a></i>.
        If you need 128 frames, rather use <i><a href="https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1">ExVideo</a></i>.
        </p>
        """)
    with gr.Row():
        # Left column: the input image, the tuning options and the action buttons.
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            with gr.Accordion("Advanced options", open=False):
                width = gr.Slider(label="Width", info="Width of the video", value=1024, minimum=256, maximum=1024, step=8)
                height = gr.Slider(label="Height", info="Height of the video", value=576, minimum=256, maximum=576, step=8)
                motion_control = gr.Checkbox(label="Motion control (experimental)", info="Fix the camera", value=False)
                video_format = gr.Radio([["*.mp4", "mp4"], ["*.avi", "avi"], ["*.wmv", "wmv"], ["*.mkv", "mkv"], ["*.mov", "mov"], ["*.gif", "gif"]], label="Video format for result", info="File extention", value="mp4", interactive=True)
                frame_format = gr.Radio([["*.webp", "webp"], ["*.png", "png"], ["*.jpeg", "jpeg"], ["*.gif (unanimated)", "gif"], ["*.bmp", "bmp"]], label="Image format for frames", info="File extention", value="webp", interactive=True)
                fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=25, minimum=5, maximum=30)
                motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
                noise_aug_strength = gr.Slider(label="Noise strength", info="The noise to add", value=0.1, minimum=0, maximum=1, step=0.1)
                num_inference_steps = gr.Slider(label="Number inference steps", info="More denoising steps usually lead to a higher quality video at the expense of slower inference", value=25, minimum=1, maximum=100, step=1)
                decoding_t = gr.Slider(label="Decoding", info="Number of frames decoded at a time; this eats more VRAM; reduce if necessary", value=3, minimum=1, maximum=5, step=1)
                version = gr.Radio([["Auto", "auto"], ["🏃🏻‍♀️ SVD (trained on 14 f/s)", "svd"], ["🏃🏻‍♀️💨 SVD-XT (trained on 25 f/s)", "svdxt"], ["DragNUWA (unstable)", "dragnuwa"]], label="Model", info="Trained model", value="auto", interactive=True)
                seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            generate_btn = gr.Button(value="🚀 Animate", variant="primary")
            # Hidden until a generation has finished (animate() makes it visible).
            reset_btn = gr.Button(value="🧹 Reinit page", variant="stop", elem_id="reset_button", visible = False)

        # Right column: result widgets; animate() shows either the video or the gif.
        with gr.Column():
            video_output = gr.Video(label="Generated video", format="mp4", autoplay=True, show_download_button=False)
            gif_output = gr.Image(label="Generated video", format="gif", show_download_button=False, visible=False)
            download_button = gr.DownloadButton(label="💾 Download video", visible=False)
            information_msg = gr.HTML(visible=False)
            gallery = gr.Gallery(label="Generated frames", visible=False)

    # The input order must match the positional parameters of animate();
    # the output order must match the list it returns.
    generate_btn.click(fn=animate, inputs=[
        image,
        seed,
        randomize_seed,
        motion_bucket_id,
        fps_id,
        noise_aug_strength,
        decoding_t,
        video_format,
        frame_format,
        version,
        width,
        height,
        motion_control,
        num_inference_steps
    ], outputs=[
        video_output,
        gif_output,
        download_button,
        gallery,
        seed,
        information_msg,
        reset_btn
    ], api_name="video")

    # The output order must match the list returned by reset().
    reset_btn.click(fn = reset, inputs = [], outputs = [
        image,
        seed,
        randomize_seed,
        motion_bucket_id,
        fps_id,
        noise_aug_strength,
        decoding_t,
        video_format,
        frame_format,
        version,
        width,
        height,
        motion_control,
        num_inference_steps
    ], queue = False, show_progress = False)

    gr.Examples(
        examples=[
            ["Examples/Fire.webp", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
            ["Examples/Water.png", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25],
            ["Examples/Town.jpeg", 42, True, 127, 25, 0.1, 3, "mp4", "png", "auto", 1024, 576, False, 25]
        ],
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id, noise_aug_strength, decoding_t, video_format, frame_format, version, width, height, motion_control, num_inference_steps],
        outputs=[video_output, gif_output, download_button, gallery, seed, information_msg, reset_btn],
        fn=animate,
        run_on_click=True,
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch(share=True, show_api=False)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- git+https://github.com/huggingface/diffusers.git
 
2
  transformers
3
  accelerate
4
  safetensors
 
1
+ git+https://github.com/Fabrice-TIERCELIN/diffusers.git
2
+ scipy
3
  transformers
4
  accelerate
5
  safetensors