Spaces:

TempoFunk
/

makeavid-sd-jax

Runtime error

App Files Files Community

Update app.py

by chavinlo - opened May 2, 2023

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+213

-439

Files changed (20) hide show

.gitattributes +0 -2
README.md +3 -1
app.py +148 -230
example.gif → example.webp +2 -2
examples/example_01_barbarian/input.png +0 -3
examples/example_01_barbarian/output.gif +0 -3
examples/example_01_barbarian/params.json +0 -14
examples/example_02_zombies/output.gif +0 -3
examples/example_02_zombies/params.json +0 -14
examples/example_03_astronaut/output.gif +0 -3
examples/example_03_astronaut/params.json +0 -14
examples/example_04_furry_moster/output.gif +0 -3
examples/example_04_furry_moster/params.json +0 -14
examples/example_05_people/input.png +0 -3
examples/example_05_people/output.gif +0 -3
examples/example_05_people/params.json +0 -14
examples/example_06_sophie/output.gif +0 -3
examples/example_06_sophie/params.json +0 -14
makeavid_sd/inference.py +58 -94
requirements.txt +1 -1

.gitattributes CHANGED Viewed

@@ -1,6 +1,4 @@
 *.webp filter=lfs diff=lfs merge=lfs -text
-*.gif filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text

 *.webp filter=lfs diff=lfs merge=lfs -text
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -12,10 +12,12 @@ library_name: diffusers
 pipeline_tag: text-to-video
 datasets:
 - TempoFunk/tempofunk-sdance
-- TempoFunk/small
 models:
 - TempoFunk/makeavid-sd-jax
 - runwayml/stable-diffusion-v1-5
 tags:
 - jax-diffusers-event
 ---

 pipeline_tag: text-to-video
 datasets:
 - TempoFunk/tempofunk-sdance
+- TempoFunk/tempofunk-m
 models:
 - TempoFunk/makeavid-sd-jax
 - runwayml/stable-diffusion-v1-5
 tags:
 - jax-diffusers-event
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
-import json
 from io import BytesIO
 import base64
 from functools import partial
@@ -8,88 +7,65 @@ from functools import partial
 from PIL import Image, ImageOps
 import gradio as gr
-from makeavid_sd.inference import (
-        InferenceUNetPseudo3D,
-        jnp,
-        SCHEDULERS
-)
 print(os.environ.get('XLA_PYTHON_CLIENT_PREALLOCATE', 'NotSet'))
 print(os.environ.get('XLA_PYTHON_CLIENT_ALLOCATOR', 'NotSet'))
 _seen_compilations = set()
 _model = InferenceUNetPseudo3D(
         model_path = 'TempoFunk/makeavid-sd-jax',
         dtype = jnp.float16,
         hf_auth_token = os.environ.get('HUGGING_FACE_HUB_TOKEN', None)
 )
-import datetime
-print(datetime.datetime.now(datetime.timezone.utc).isoformat())
 if _model.failed != False:
     trace = f'```{_model.failed}```'
     with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled = False) as demo:
         exception = gr.Markdown(trace)
-    demo.launch()
-_examples = []
-_expath = 'examples'
-for x in sorted(os.listdir(_expath)):
-    with open(os.path.join(_expath, x, 'params.json'), 'r') as f:
-        ex = json.load(f)
-    ex['image_input'] = None
-    if os.path.isfile(os.path.join(_expath, x, 'input.png')):
-        ex['image_input'] = os.path.join(_expath, x, 'input.png')
-    ex['image_output'] = os.path.join(_expath, x, 'output.gif')
-    _examples.append(ex)
-_output_formats = (
-    'webp', 'gif'
-)
 # gradio is illiterate. type hints make it go poopoo in pantsu.
 def generate(
         prompt = 'An elderly man having a great time in the park.',
         neg_prompt = '',
-        hint_image = None,
         inference_steps = 20,
-        cfg = 15.0,
-        cfg_image = 9.0,
         seed = 0,
-        fps = 12,
         num_frames = 24,
         height = 512,
-        width = 512,
-        scheduler_type = 'dpm',
-        output_format = 'gif'
 ) -> str:
-    num_frames = min(24, max(2, int(num_frames)))
-    inference_steps = min(60, max(2, int(inference_steps)))
-    height = min(576, max(256, int(height)))
-    width = min(576, max(256, int(width)))
-    height = (height // 64) * 64
-    width = (width // 64) * 64
-    cfg = max(cfg, 1.0)
-    cfg_image = max(cfg_image, 1.0)
-    fps = min(1000, max(1, int(fps)))
-    seed = min(2**32-2, int(seed))
     if seed < 0:
         seed = -seed
     if hint_image is not None:
         if hint_image.mode != 'RGB':
             hint_image = hint_image.convert('RGB')
         if hint_image.size != (width, height):
             hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
-    scheduler_type = scheduler_type.lower()
-    if scheduler_type not in SCHEDULERS:
-        scheduler_type = 'dpm'
-    output_format = output_format.lower()
-    if output_format not in _output_formats:
-        output_format = 'gif'
-    mask_image = None
     images = _model.generate(
             prompt = [prompt] * _model.device_count,
             neg_prompt = neg_prompt,
@@ -97,44 +73,56 @@ def generate(
             mask_image = mask_image,
             inference_steps = inference_steps,
             cfg = cfg,
-            cfg_image = cfg_image,
             height = height,
             width = width,
             num_frames = num_frames,
-            seed = seed,
-            scheduler_type = scheduler_type
     )
     _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))
-    with BytesIO() as buffer:
-        images[1].save(
-                buffer,
-                format = output_format,
-                save_all = True,
-                append_images = images[2:],
-                loop = 0,
-                duration = round(1000 / fps),
-                allow_mixed = True,
-                optimize = True
-        )
-        data = f'data:image/{output_format};base64,' + base64.b64encode(buffer.getvalue()).decode()
-    with BytesIO() as buffer:
-        images[-1].save(buffer, format = 'png', optimize = True)
-        last_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
-    with BytesIO() as buffer:
-        images[0].save(buffer, format ='png', optimize = True)
-        first_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
-    return data, last_data, first_data
-def check_if_compiled(hint_image, inference_steps, height, width, num_frames, scheduler_type, message):
     height = int(height)
     width = int(width)
-    inference_steps = int(inference_steps)
-    height = (height // 64) * 64
-    width = (width // 64) * 64
-    if (hint_image is None, inference_steps, height, width, num_frames, scheduler_type) in _seen_compilations:
         return ''
     else:
-        return  message
 with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled = False) as demo:
     variant = 'panel'
@@ -143,67 +131,60 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             intro1 = gr.Markdown("""
                         # Make-A-Video Stable Diffusion JAX
-                        We have extended a pretrained latent-diffusion inpainting image generation model with **temporal convolutions and attention**.
-                        We guide the video generation with a hint image by taking advantage of the extra 5 input channels of the inpainting model.
-                        In this demo the hint image can be given by the user, otherwise it is generated by an generative image model.
-                        The temporal layers are a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch) to [JAX](https://github.com/google/jax) utilizing [FLAX](https://github.com/google/flax).
-                        The convolution is pseudo 3D and seperately convolves accross the spatial dimension in 2D and over the temporal dimension in 1D.
-                        Temporal attention is purely self attention and also separately attends to time.
                         Only the new temporal layers have been fine tuned on a dataset of videos themed around dance.
-                        The model has been trained for 80 epochs on a dataset of 18,000 Videos with 120 frames each, randomly selecting a 24 frame range from each sample.
-                        Model: [TempoFunk/makeavid-sd-jax](https://huggingface.co/TempoFunk/makeavid-sd-jax)
-                        Datasets: [TempoFunk/tempofunk-sdance](https://huggingface.co/datasets/TempoFunk/tempofunk-sdance), [TempoFunk/small](https://huggingface.co/datasets/TempoFunk/small)
-                        Model implementation and training code can be found at <https://github.com/lopho/makeavid-sd-tpu> (WIP)
             """)
         with gr.Column():
             intro3 = gr.Markdown("""
                         **Please be patient. The model might have to compile with current parameters.**
                         This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
-                        The compilation will be cached and later runs with the same parameters
                         will be much faster.
                         Changes to the following parameters require the model to compile
                         - Number of frames
                         - Width & Height
-                        - Inference steps
                         - Input image vs. no input image
-                        - Noise scheduler type
-                        If you encounter any issues, please report them here: [Space discussions](https://huggingface.co/spaces/TempoFunk/makeavid-sd-jax/discussions) (or DM [@lopho](https://twitter.com/lopho))
-                        <small>Leave a ❤️ like if you like. Consider it a dopamine donation at no cost.</small>
             """)
     with gr.Row(variant = variant):
-        with gr.Column():
             with gr.Row():
                 #cancel_button = gr.Button(value = 'Cancel')
                 submit_button = gr.Button(value = 'Make A Video', variant = 'primary')
             prompt_input = gr.Textbox(
                     label = 'Prompt',
-                    value = 'They are dancing in the club but everybody is a 3d cg  hairy monster wearing a hairy costume.',
                     interactive = True
             )
             neg_prompt_input = gr.Textbox(
                     label = 'Negative prompt (optional)',
-                    value = 'monochrome, saturated',
                     interactive = True
             )
-            cfg_input = gr.Slider(
-                    label = 'Guidance scale video',
-                    minimum = 1.0,
-                    maximum = 20.0,
-                    step = 0.1,
-                    value = 15.0,
-                    interactive = True
             )
-            cfg_image_input = gr.Slider(
-                    label = 'Guidance scale hint (no effect with input image)',
                     minimum = 1.0,
                     maximum = 20.0,
                     step = 0.1,
@@ -217,152 +198,89 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                     precision = 0
             )
             image_input = gr.Image(
-                    label = 'Hint image (optional)',
                     interactive = True,
                     image_mode = 'RGB',
                     type = 'pil',
                     optional = True,
-                    source = 'upload'
-            )
-            inference_steps_input = gr.Slider(
-                    label = 'Steps',
-                    minimum = 2,
-                    maximum = 60,
-                    value = 20,
-                    step = 1,
-                    interactive = True
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
-                    minimum = 2,
                     maximum = 24,
                     step = 1,
-                    value = 24,
-                    interactive = True
             )
             width_input = gr.Slider(
                     label = 'Width',
-                    minimum = 256,
-                    maximum = 576,
-                    step = 64,
-                    value = 512,
-                    interactive = True
             )
             height_input = gr.Slider(
                     label = 'Height',
-                    minimum = 256,
-                    maximum = 576,
-                    step = 64,
-                    value = 512,
-                    interactive = True
             )
-            scheduler_input = gr.Dropdown(
-                    label = 'Noise scheduler',
-                    choices = list(SCHEDULERS.keys()),
-                    value = 'dpm',
-                    interactive = True
             )
-            with gr.Row():
-                fps_input = gr.Slider(
-                        label = 'Output FPS',
-                        minimum = 1,
-                        maximum = 1000,
-                        step = 1,
-                        value = 12,
-                        interactive = True
-                )
-                output_format = gr.Dropdown(
-                        label = 'Output format',
-                        choices = _output_formats,
-                        value = 'gif',
-                        interactive = True
-                )
-        with gr.Column():
-            #will_trigger = gr.Markdown('')
-            patience = gr.Markdown('**Please be patient. The model might have to compile with current parameters.**')
             image_output = gr.Image(
                     label = 'Output',
-                    value = 'example.gif',
                     interactive = False
             )
-            tips = gr.Markdown('🤫 *Secret tip*: try using the last frame as input for the next generation.')
-            with gr.Row():
-                last_frame_output = gr.Image(
-                        label = 'Last frame',
-                        interactive = False
-                )
-                first_frame_output = gr.Image(
-                        label = 'Initial frame',
-                        interactive = False
-                )
-    examples_lst = []
-    for x in _examples:
-        examples_lst.append([
-                x['image_output'],
-                x['prompt'],
-                x['neg_prompt'],
-                x['image_input'],
-                x['cfg'],
-                x['cfg_image'],
-                x['seed'],
-                x['fps'],
-                x['steps'],
-                x['scheduler'],
-                x['num_frames'],
-                x['height'],
-                x['width'],
-                x['format']
-        ])
-    examples = gr.Examples(
-            examples = examples_lst,
-            inputs = [
-                    image_output,
-                    prompt_input,
-                    neg_prompt_input,
-                    image_input,
-                    cfg_input,
-                    cfg_image_input,
-                    seed_input,
-                    fps_input,
-                    inference_steps_input,
-                    scheduler_input,
-                    num_frames_input,
-                    height_input,
-                    width_input,
-                    output_format
-            ],
-            postprocess = False
-    )
-    #trigger_inputs = [ image_input, inference_steps_input, height_input, width_input, num_frames_input, scheduler_input ]
-    #trigger_check_fun = partial(check_if_compiled, message = 'Current parameters need compilation.')
-    #height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #scheduler_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    submit_button.click(
-            fn = generate,
-            inputs = [
-                    prompt_input,
-                    neg_prompt_input,
-                    image_input,
-                    inference_steps_input,
-                    cfg_input,
-                    cfg_image_input,
-                    seed_input,
-                    fps_input,
-                    num_frames_input,
-                    height_input,
-                    width_input,
-                    scheduler_input,
-                    output_format
-            ],
-            outputs = [ image_output, last_frame_output, first_frame_output ],
-            postprocess = False
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
-demo.queue(concurrency_count = 1, max_size = 8, api_open = True)
-demo.launch(show_api = True)

 import os
 from io import BytesIO
 import base64
 from functools import partial
 from PIL import Image, ImageOps
 import gradio as gr
+from makeavid_sd.inference import InferenceUNetPseudo3D, FlaxDPMSolverMultistepScheduler, jnp
 print(os.environ.get('XLA_PYTHON_CLIENT_PREALLOCATE', 'NotSet'))
 print(os.environ.get('XLA_PYTHON_CLIENT_ALLOCATOR', 'NotSet'))
+_preheat: bool = False
 _seen_compilations = set()
 _model = InferenceUNetPseudo3D(
         model_path = 'TempoFunk/makeavid-sd-jax',
+        scheduler_cls = FlaxDPMSolverMultistepScheduler,
         dtype = jnp.float16,
         hf_auth_token = os.environ.get('HUGGING_FACE_HUB_TOKEN', None)
 )
 if _model.failed != False:
     trace = f'```{_model.failed}```'
     with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled = False) as demo:
         exception = gr.Markdown(trace)
+    demo.launch()
 # gradio is illiterate. type hints make it go poopoo in pantsu.
 def generate(
         prompt = 'An elderly man having a great time in the park.',
         neg_prompt = '',
+        image = { 'image': None, 'mask': None },
         inference_steps = 20,
+        cfg = 12.0,
         seed = 0,
+        fps = 24,
         num_frames = 24,
         height = 512,
+        width = 512
 ) -> str:
+    height = int((height//32)*32)
+    width = int((width//32)*32)
+    num_frames = int(num_frames)
+    seed = int(seed)
     if seed < 0:
         seed = -seed
+    inference_steps = int(inference_steps)
+    if image is not None:
+        hint_image = image['image']
+        mask_image = image['mask']
+    else:
+        hint_image = None
+        mask_image = None
     if hint_image is not None:
         if hint_image.mode != 'RGB':
             hint_image = hint_image.convert('RGB')
         if hint_image.size != (width, height):
             hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
+    if mask_image is not None:
+        if mask_image.mode != 'L':
+            mask_image = mask_image.convert('L')
+        if mask_image.size != (width, height):
+            mask_image = ImageOps.fit(mask_image, (width, height), method = Image.Resampling.LANCZOS)
     images = _model.generate(
             prompt = [prompt] * _model.device_count,
             neg_prompt = neg_prompt,
             mask_image = mask_image,
             inference_steps = inference_steps,
             cfg = cfg,
             height = height,
             width = width,
             num_frames = num_frames,
+            seed = seed
     )
     _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))
+    buffer = BytesIO()
+    images[0].save(
+            buffer,
+            format = 'webp',
+            save_all = True,
+            append_images = images[1:],
+            loop = 0,
+            duration = round(1000 / fps),
+            allow_mixed = True
+    )
+    data = base64.b64encode(buffer.getvalue()).decode()
+    data = 'data:image/webp;base64,' + data
+    buffer.close()
+    return data
+def check_if_compiled(image, inference_steps, height, width, num_frames, message):
     height = int(height)
     width = int(width)
+    hint_image = None if image is None else image['image']
+    if (hint_image is None, inference_steps, height, width, num_frames) in _seen_compilations:
         return ''
     else:
+        return  f"""{message}"""
+if _preheat:
+    print('\npreheating the oven')
+    generate(
+            prompt = 'preheating the oven',
+            neg_prompt = '',
+            image = { 'image': None, 'mask': None },
+            inference_steps = 20,
+            cfg = 12.0,
+            seed = 0
+    )
+    print('Entertaining the guests with sailor songs played on an old piano.')
+    dada = generate(
+            prompt = 'Entertaining the guests with sailor songs played on an old harmonium.',
+            neg_prompt = '',
+            image = { 'image': Image.new('RGB', size = (512, 512), color = (0, 0, 0)), 'mask': None },
+            inference_steps = 20,
+            cfg = 12.0,
+            seed = 0
+    )
+    print('dinner is ready\n')
 with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled = False) as demo:
     variant = 'panel'
             intro1 = gr.Markdown("""
                         # Make-A-Video Stable Diffusion JAX
+                        We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
+                        We take advantage of the extra 5 input channels of the inpaint model to guide the video generation with a hint image and mask.
+                        The hint image can be given by the user, otherwise it is generated by a generative image model.
+                        The temporal convolution and attention is a port of [Make-A-Video Pytorch](https://github.com/lucidrains/make-a-video-pytorch/blob/main/make_a_video_pytorch) to FLAX.
+                        It is a pseudo 3D convolution that separately convolves across the spatial dimension in 2D and over the temporal dimension in 1D.
+                        Temporal attention is purely self attention and also separately attends to time and space.
                         Only the new temporal layers have been fine tuned on a dataset of videos themed around dance.
+                        The model has been trained for 60 epochs on a dataset of 10,000 Videos with 120 frames each, randomly selecting a 24 frame range from each sample.
+                        See model and dataset links in the metadata.
+                        Model implementation and training code can be found at [https://github.com/lopho/makeavid-sd-tpu](https://github.com/lopho/makeavid-sd-tpu)
             """)
         with gr.Column():
             intro3 = gr.Markdown("""
                         **Please be patient. The model might have to compile with current parameters.**
                         This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
+                        The compilation will be cached and consecutive runs with the same parameters
                         will be much faster.
                         Changes to the following parameters require the model to compile
                         - Number of frames
                         - Width & Height
+                        - Steps
                         - Input image vs. no input image
             """)
     with gr.Row(variant = variant):
+        with gr.Column(variant = variant):
             with gr.Row():
                 #cancel_button = gr.Button(value = 'Cancel')
                 submit_button = gr.Button(value = 'Make A Video', variant = 'primary')
             prompt_input = gr.Textbox(
                     label = 'Prompt',
+                    value = 'They are dancing in the club while sweat drips from the ceiling.',
                     interactive = True
             )
             neg_prompt_input = gr.Textbox(
                     label = 'Negative prompt (optional)',
+                    value = '',
                     interactive = True
             )
+            inference_steps_input = gr.Slider(
+                label = 'Steps',
+                minimum = 2,
+                maximum = 100,
+                value = 20,
+                step = 1
             )
+            cfg_input = gr.Slider(
+                    label = 'Guidance scale',
                     minimum = 1.0,
                     maximum = 20.0,
                     step = 0.1,
                     precision = 0
             )
             image_input = gr.Image(
+                    label = 'Input image (optional)',
                     interactive = True,
                     image_mode = 'RGB',
                     type = 'pil',
                     optional = True,
+                    source = 'upload',
+                    tool = 'sketch'
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
+                    minimum = 1,
                     maximum = 24,
                     step = 1,
+                    value = 24
             )
             width_input = gr.Slider(
                     label = 'Width',
+                    minimum = 64,
+                    maximum = 512,
+                    step = 32,
+                    value = 448
             )
             height_input = gr.Slider(
                     label = 'Height',
+                    minimum = 64,
+                    maximum = 512,
+                    step = 32,
+                    value = 448
             )
+            fps_input = gr.Slider(
+                    label = 'Output FPS',
+                    minimum = 1,
+                    maximum = 1000,
+                    step = 1,
+                    value = 12
             )
+        with gr.Column(variant = variant):
+            #no_gpu = gr.Markdown('**Until a GPU is assigned expect extremely long runtimes up to 1h+**')
+            will_trigger = gr.Markdown('')
+            patience = gr.Markdown('')
             image_output = gr.Image(
                     label = 'Output',
+                    value = 'example.webp',
                     interactive = False
             )
+    trigger_inputs =  [ image_input, inference_steps_input, height_input, width_input, num_frames_input ]
+    trigger_check_fun = partial(check_if_compiled, message = 'Current parameters will trigger compilation.')
+    height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    will_trigger.value = trigger_check_fun(image_input.value, inference_steps_input.value, height_input.value, width_input.value, num_frames_input.value)
+    ev = submit_button.click(
+        fn = partial(
+                check_if_compiled,
+                message = 'Please be patient. The model has to be compiled with current parameters.'
+        ),
+        inputs = trigger_inputs,
+        outputs = patience
+    ).then(
+        fn = generate,
+        inputs = [
+                prompt_input,
+                neg_prompt_input,
+                image_input,
+                inference_steps_input,
+                cfg_input,
+                seed_input,
+                fps_input,
+                num_frames_input,
+                height_input,
+                width_input
+        ],
+        outputs = image_output,
+        postprocess = False
+    ).then(
+        fn = trigger_check_fun,
+        inputs = trigger_inputs,
+        outputs = will_trigger
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
+demo.queue(concurrency_count = 1, max_size = 32)
+demo.launch()

example.gif → example.webp RENAMED Viewed

File without changes

examples/example_01_barbarian/input.png DELETED Viewed

Git LFS Details

SHA256: 87c4d10eb4e1bfa8f09657ec0d85de66052e34c1801b7b21e1cfd4123504b42b
Pointer size: 131 Bytes
Size of remote file: 471 kB

examples/example_01_barbarian/output.gif DELETED Viewed

Git LFS Details

SHA256: b9d3fb7269244fe2e60bfe5a1104e4b5ec6c9e322b371573d8adacba017b594d
Pointer size: 132 Bytes
Size of remote file: 5.33 MB

examples/example_01_barbarian/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "he is dancing as minotaur dancer wearing a fur armor water in a dark cave, john cena, fantasy, barbarian",
-    "neg_prompt": "",
-    "cfg": 15,
-    "cfg_image": 9,
-    "seed": 1,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 20,
-    "format": "gif",
-    "num_frames": 24
-}

examples/example_02_zombies/output.gif DELETED Viewed

Git LFS Details

SHA256: f31690b537f0f45dda16c16281a7bfb730e2f431825fe0b5f5db6f0b9626b388
Pointer size: 132 Bytes
Size of remote file: 4.39 MB

examples/example_02_zombies/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "Group of scary zombies dancing. Halloween concept.",
-    "neg_prompt": "monochrome",
-    "cfg": 15,
-    "cfg_image": 15,
-    "seed": 0,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 20,
-    "format": "gif",
-    "num_frames": 24
-}

examples/example_03_astronaut/output.gif DELETED Viewed

Git LFS Details

SHA256: d849e387f15d15ba192eba4ffd11613fb70c19a23a2df4df47cee2d1cd049695
Pointer size: 132 Bytes
Size of remote file: 5.36 MB

examples/example_03_astronaut/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "Astronaut performing shuffle dance moves on a Moon surface. Stanley Kubrick.",
-    "neg_prompt": "",
-    "cfg": 15,
-    "cfg_image": 15,
-    "seed": 0,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 20,
-    "format": "gif",
-    "num_frames": 24
-}

examples/example_04_furry_moster/output.gif DELETED Viewed

Git LFS Details

SHA256: d5cd05f2a45e4b0b3fa5465d8a8203fad029246071163787cb602e8d630aa70d
Pointer size: 132 Bytes
Size of remote file: 4.33 MB

examples/example_04_furry_moster/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "They are dancing in the club but everybody is a 3d cg  hairy monster wearing a hairy costume.",
-    "neg_prompt": "monochrome, saturated",
-    "cfg": 15,
-    "cfg_image": 15,
-    "seed": 0,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 12,
-    "format": "gif",
-    "num_frames": 24
-}

examples/example_05_people/input.png DELETED Viewed

Git LFS Details

SHA256: 62ab7da78435c4284915c836986d3d4c72610a28b5ca5d971bccb9a639686b43
Pointer size: 131 Bytes
Size of remote file: 408 kB

examples/example_05_people/output.gif DELETED Viewed

Git LFS Details

SHA256: 0be83e354696c3c113d2053ec00b1ca4a7a5d797ffa58310e58973084b025a57
Pointer size: 132 Bytes
Size of remote file: 4.48 MB

examples/example_05_people/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "Front view close up of group of people dancing at a concert in nightclub.",
-    "neg_prompt": "",
-    "cfg": 15,
-    "cfg_image": 9,
-    "seed": 3,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 20,
-    "format": "gif",
-    "num_frames": 24
-}

examples/example_06_sophie/output.gif DELETED Viewed

Git LFS Details

SHA256: 464c468839bdc51e36f8f3c61f1d8d5f823414d207d024059ee9dbcebceda044
Pointer size: 132 Bytes
Size of remote file: 4.63 MB

examples/example_06_sophie/params.json DELETED Viewed

@@ -1,14 +0,0 @@
-{
-    "prompt": "A girl is dancing by a beautiful lake by sophie anderson and greg rutkowski and alphonse mucha.",
-    "neg_prompt": "",
-    "cfg": 15,
-    "cfg_image": 15,
-    "seed": 1,
-    "steps": 20,
-    "width": 512,
-    "height": 512,
-    "scheduler": "dpm",
-    "fps": 20,
-    "format": "gif",
-    "num_frames": 24
-}

makeavid_sd/inference.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from typing import Any, Union, Optional, Tuple, List, Dict
 import os
 import gc
 from functools import partial
@@ -17,14 +17,13 @@ import einops
 from diffusers import FlaxAutoencoderKL, FlaxUNet2DConditionModel
 from diffusers import (
         FlaxDDIMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
 )
-from diffusers.schedulers.scheduling_ddim_flax import DDIMSchedulerState
-from diffusers.schedulers.scheduling_pndm_flax import PNDMSchedulerState
-from diffusers.schedulers.scheduling_lms_discrete_flax import LMSDiscreteSchedulerState
-from diffusers.schedulers.scheduling_dpmsolver_multistep_flax import DPMSolverMultistepSchedulerState
 from transformers import FlaxCLIPTextModel, CLIPTokenizer
@@ -32,31 +31,14 @@ from .flax_impl.flax_unet_pseudo3d_condition import UNetPseudo3DConditionModel
 SchedulerType = Union[
         FlaxDDIMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
 ]
-SchedulerStateType = Union[
-        DDIMSchedulerState,
-        PNDMSchedulerState,
-        LMSDiscreteSchedulerState,
-        DPMSolverMultistepSchedulerState,
-]
-SCHEDULERS: Dict[str, SchedulerType] = {
-        'dpm': FlaxDPMSolverMultistepScheduler, # husbando
-        'ddim': FlaxDDIMScheduler,
-        #'PLMS': FlaxPNDMScheduler, # its not correctly implemented in diffusers, output is bad, but at least it "works"
-        #'LMS': FlaxLMSDiscreteScheduler, # borked
-        #    image_latents, image_scheduler_state = scheduler.step(
-        #    File "/mnt/work1/make_a_vid/makeavid-space/.venv/lib/python3.10/site-packages/diffusers/schedulers/scheduling_lms_discrete_flax.py", line 255, in step
-        #    order = min(timestep + 1, order)
-        #    jax._src.errors.ConcretizationTypeError: Abstract tracer value encountered where concrete value is expected: Traced<ShapedArray(bool[])>with<DynamicJaxprTrace(level=1/1)>
-        #    The problem arose with the `bool` function.
-        # The error occurred while tracing the function scanned_fun at /mnt/work1/make_a_vid/makeavid-space/.venv/lib/python3.10/site-packages/jax/_src/lax/control_flow/loops.py:1668 for scan. This concrete value was not available in Python because it depends on the values of the arguments loop_carry[0] and loop_carry[1][1].timesteps
-}
 def dtypestr(x: jnp.dtype):
     if x == jnp.float32: return 'float32'
     elif x == jnp.float16: return 'float16'
@@ -71,6 +53,7 @@ def castto(dtype, m, x):
 class InferenceUNetPseudo3D:
     def __init__(self,
             model_path: str,
             dtype: jnp.dtype = jnp.float16,
             hf_auth_token: Union[str, None] = None
     ) -> None:
@@ -146,27 +129,28 @@ class InferenceUNetPseudo3D:
                 subfolder = 'tokenizer',
                 use_auth_token = self.hf_auth_token
         )
-        self.schedulers: Dict[str, Dict[str, SchedulerType]] = {}
-        for scheduler_name in SCHEDULERS:
-            if scheduler_name not in ['KarrasVe', 'SDEVe']:
-                scheduler, scheduler_state = SCHEDULERS[scheduler_name].from_pretrained(
-                        self.model_path,
-                        subfolder = 'scheduler',
-                        dtype = jnp.float32,
-                        use_auth_token = self.hf_auth_token
-                )
-            else:
-                scheduler, scheduler_state = SCHEDULERS[scheduler_name].from_pretrained(
-                        self.model_path,
-                        subfolder = 'scheduler',
-                        use_auth_token = self.hf_auth_token
-                )
-            self.schedulers[scheduler_name] = scheduler
-            self.params[scheduler_name] = scheduler_state
         self.vae_scale_factor: int = int(2 ** (len(self.vae.config.block_out_channels) - 1))
         self.device_count = jax.device_count()
         gc.collect()
     def prepare_inputs(self,
             prompt: List[str],
             neg_prompt: List[str],
@@ -224,18 +208,16 @@ class InferenceUNetPseudo3D:
         return tokens, neg_tokens, hint, mask
     def generate(self,
-            prompt: Union[str, List[str]] = '',
-            inference_steps: int = 20,
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
-            cfg: float = 15.0,
-            cfg_image: Optional[float] = None,
             num_frames: int = 24,
             width: int = 512,
             height: int = 512,
-            seed: int = 0,
-            scheduler_type: str = 'dpm'
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'
@@ -261,7 +243,6 @@ class InferenceUNetPseudo3D:
         if isinstance(neg_prompt, str):
             neg_prompt = [ neg_prompt ] * batch_size
         assert len(neg_prompt) == batch_size, f'number of negative prompts must be equal to batch size {batch_size} but is {len(neg_prompt)}'
-        assert scheduler_type in SCHEDULERS, f'unknown type of noise scheduler: {scheduler_type}, must be one of {list(SCHEDULERS.keys())}'
         tokens, neg_tokens, hint, mask = self.prepare_inputs(
                 prompt = prompt,
                 neg_prompt = neg_prompt,
@@ -270,14 +251,11 @@ class InferenceUNetPseudo3D:
                 width = width,
                 height = height
         )
-        if cfg_image is None:
-            cfg_image = cfg
-        #params['scheduler'] = scheduler_state
         # NOTE splitting rngs is not deterministic,
         # running on different device counts gives different seeds
         #rng = jax.random.PRNGKey(seed)
         #rngs = jax.random.split(rng, self.device_count)
-        # manually assign seeded RNGs to devices for reproducability
         rngs = jnp.array([ jax.random.PRNGKey(seed + i) for i in range(self.device_count) ])
         params = jax_utils.replicate(self.params)
         tokens = shard(tokens)
@@ -294,11 +272,9 @@ class InferenceUNetPseudo3D:
             height,
             width,
             cfg,
-            cfg_image,
             rngs,
             params,
-            use_imagegen,
-            scheduler_type,
         )
         if images.ndim == 5:
             images = einops.rearrange(images, 'd f c h w -> (d f) h w c')
@@ -319,11 +295,9 @@ class InferenceUNetPseudo3D:
             height,
             width,
             cfg: float,
-            cfg_image: float,
             rng: jax.random.KeyArray,
             params: Union[Dict[str, Any], FrozenDict[str, Any]],
-            use_imagegen: bool,
-            scheduler_type: str
     ) -> List[Image.Image]:
         batch_size = tokens.shape[0]
         latent_h = height // self.vae_scale_factor
@@ -338,18 +312,15 @@ class InferenceUNetPseudo3D:
         encoded_prompt = self.text_encoder(tokens, params = params['text_encoder'])[0]
         encoded_neg_prompt = self.text_encoder(neg_tokens, params = params['text_encoder'])[0]
-        scheduler = self.schedulers[scheduler_type]
-        scheduler_state = params[scheduler_type]
         if use_imagegen:
             image_latent_shape = (batch_size, self.vae.config.latent_channels, latent_h, latent_w)
             image_latents = jax.random.normal(
                     rng,
                     shape = image_latent_shape,
                     dtype = jnp.float32
-            ) * scheduler_state.init_noise_sigma
-            image_scheduler_state = scheduler.set_timesteps(
-                    scheduler_state,
                     num_inference_steps = inference_steps,
                     shape = image_latents.shape
             )
@@ -357,21 +328,21 @@ class InferenceUNetPseudo3D:
                 image_latents, image_scheduler_state = args
                 t = image_scheduler_state.timesteps[step]
                 tt = jnp.broadcast_to(t, image_latents.shape[0])
-                latents_input = scheduler.scale_model_input(image_scheduler_state, image_latents, t)
                 noise_pred = self.imunet.apply(
-                        { 'params': params['imunet']} ,
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_prompt
                 ).sample
                 noise_pred_uncond = self.imunet.apply(
-                        { 'params': params['imunet'] },
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_neg_prompt
                 ).sample
-                noise_pred = noise_pred_uncond + cfg_image * (noise_pred - noise_pred_uncond)
-                image_latents, image_scheduler_state = scheduler.step(
                         image_scheduler_state,
                         noise_pred.astype(jnp.float32),
                         t,
@@ -386,7 +357,7 @@ class InferenceUNetPseudo3D:
             hint = image_latents
         else:
             hint = self.vae.apply(
-                    { 'params': params['vae'] },
                     hint,
                     method = self.vae.encode
             ).latent_dist.mean * self.vae.config.scaling_factor
@@ -404,9 +375,9 @@ class InferenceUNetPseudo3D:
                 rng,
                 shape = latent_shape,
                 dtype = jnp.float32
-        ) * scheduler_state.init_noise_sigma
-        scheduler_state = scheduler.set_timesteps(
-                scheduler_state,
                 num_inference_steps = inference_steps,
                 shape = latents.shape
         )
@@ -415,7 +386,7 @@ class InferenceUNetPseudo3D:
             latents, scheduler_state = args
             t = scheduler_state.timesteps[step]#jnp.array(scheduler_state.timesteps, dtype = jnp.int32)[step]
             tt = jnp.broadcast_to(t, latents.shape[0])
-            latents_input = scheduler.scale_model_input(scheduler_state, latents, t)
             latents_input = jnp.concatenate([latents_input, mask, hint], axis = 1)
             noise_pred = self.unet.apply(
                     { 'params': params['unet'] },
@@ -430,7 +401,7 @@ class InferenceUNetPseudo3D:
                     encoded_neg_prompt
             ).sample
             noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
-            latents, scheduler_state = scheduler.step(
                     scheduler_state,
                     noise_pred.astype(jnp.float32),
                     t,
@@ -482,11 +453,9 @@ class InferenceUNetPseudo3D:
                 None,   #  7 height
                 None,   #  8 width
                 None,   #  9 cfg
-                None,   # 10 cfg_image
-                0,      # 11 rng
-                0,      # 12 params
-                None,   # 13 use_imagegen
-                None,   # 14 scheduler_type
         ),
         static_broadcasted_argnums = ( # trigger recompilation on change
                 0,      # inference_class
@@ -494,8 +463,7 @@ class InferenceUNetPseudo3D:
                 6,      # num_frames
                 7,      # height
                 8,      # width
-                13,     # use_imagegen
-                14,     # scheduler_type
         )
 )
 def _p_generate(
@@ -504,16 +472,14 @@ def _p_generate(
         neg_tokens,
         hint,
         mask,
-        inference_steps: int,
-        num_frames: int,
-        height: int,
-        width: int,
-        cfg: float,
-        cfg_image: float,
         rng,
         params,
-        use_imagegen: bool,
-        scheduler_type: str
 ):
     return inference_class._generate(
             tokens,
@@ -525,10 +491,8 @@ def _p_generate(
             height,
             width,
             cfg,
-            cfg_image,
             rng,
             params,
-            use_imagegen,
-            scheduler_type
     )

+from typing import Any, Union, Tuple, List, Dict
 import os
 import gc
 from functools import partial
 from diffusers import FlaxAutoencoderKL, FlaxUNet2DConditionModel
 from diffusers import (
         FlaxDDIMScheduler,
+        FlaxDDPMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
+        FlaxKarrasVeScheduler,
+        FlaxScoreSdeVeScheduler
 )
 from transformers import FlaxCLIPTextModel, CLIPTokenizer
 SchedulerType = Union[
         FlaxDDIMScheduler,
+        FlaxDDPMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
+        FlaxKarrasVeScheduler,
+        FlaxScoreSdeVeScheduler
 ]
 def dtypestr(x: jnp.dtype):
     if x == jnp.float32: return 'float32'
     elif x == jnp.float16: return 'float16'
 class InferenceUNetPseudo3D:
     def __init__(self,
             model_path: str,
+            scheduler_cls: SchedulerType = FlaxDDIMScheduler,
             dtype: jnp.dtype = jnp.float16,
             hf_auth_token: Union[str, None] = None
     ) -> None:
                 subfolder = 'tokenizer',
                 use_auth_token = self.hf_auth_token
         )
+        scheduler, scheduler_state = scheduler_cls.from_pretrained(
+                self.model_path,
+                subfolder = 'scheduler',
+                dtype = jnp.float32,
+                use_auth_token = self.hf_auth_token
+        )
+        self.scheduler: scheduler_cls = scheduler
+        self.params['scheduler'] = scheduler_state
         self.vae_scale_factor: int = int(2 ** (len(self.vae.config.block_out_channels) - 1))
         self.device_count = jax.device_count()
         gc.collect()
+    def set_scheduler(self, scheduler_cls: SchedulerType) -> None:
+        scheduler, scheduler_state = scheduler_cls.from_pretrained(
+                self.model_path,
+                subfolder = 'scheduler',
+                dtype = jnp.float32,
+                use_auth_token = self.hf_auth_token
+        )
+        self.scheduler: scheduler_cls = scheduler
+        self.params['scheduler'] = scheduler_state
     def prepare_inputs(self,
             prompt: List[str],
             neg_prompt: List[str],
         return tokens, neg_tokens, hint, mask
     def generate(self,
+            prompt: Union[str, List[str]],
+            inference_steps: int,
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
+            cfg: float = 10.0,
             num_frames: int = 24,
             width: int = 512,
             height: int = 512,
+            seed: int = 0
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'
         if isinstance(neg_prompt, str):
             neg_prompt = [ neg_prompt ] * batch_size
         assert len(neg_prompt) == batch_size, f'number of negative prompts must be equal to batch size {batch_size} but is {len(neg_prompt)}'
         tokens, neg_tokens, hint, mask = self.prepare_inputs(
                 prompt = prompt,
                 neg_prompt = neg_prompt,
                 width = width,
                 height = height
         )
         # NOTE: splitting the RNG is not stable across device counts;
         # running on a different number of devices gives different seeds
         #rng = jax.random.PRNGKey(seed)
         #rngs = jax.random.split(rng, self.device_count)
+        # manually assign seeded RNGs to devices for reproducibility
         rngs = jnp.array([ jax.random.PRNGKey(seed + i) for i in range(self.device_count) ])
         params = jax_utils.replicate(self.params)
         tokens = shard(tokens)
             height,
             width,
             cfg,
             rngs,
             params,
+            use_imagegen
         )
         if images.ndim == 5:
             images = einops.rearrange(images, 'd f c h w -> (d f) h w c')
             height,
             width,
             cfg: float,
             rng: jax.random.KeyArray,
             params: Union[Dict[str, Any], FrozenDict[str, Any]],
+            use_imagegen: bool
     ) -> List[Image.Image]:
         batch_size = tokens.shape[0]
         latent_h = height // self.vae_scale_factor
         encoded_prompt = self.text_encoder(tokens, params = params['text_encoder'])[0]
         encoded_neg_prompt = self.text_encoder(neg_tokens, params = params['text_encoder'])[0]
         if use_imagegen:
             image_latent_shape = (batch_size, self.vae.config.latent_channels, latent_h, latent_w)
             image_latents = jax.random.normal(
                     rng,
                     shape = image_latent_shape,
                     dtype = jnp.float32
+            ) * params['scheduler'].init_noise_sigma
+            image_scheduler_state = self.scheduler.set_timesteps(
+                    params['scheduler'],
                     num_inference_steps = inference_steps,
                     shape = image_latents.shape
             )
                 image_latents, image_scheduler_state = args
                 t = image_scheduler_state.timesteps[step]
                 tt = jnp.broadcast_to(t, image_latents.shape[0])
+                latents_input = self.scheduler.scale_model_input(image_scheduler_state, image_latents, t)
                 noise_pred = self.imunet.apply(
+                        {'params': params['imunet']},
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_prompt
                 ).sample
                 noise_pred_uncond = self.imunet.apply(
+                        {'params': params['imunet']},
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_neg_prompt
                 ).sample
+                noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
+                image_latents, image_scheduler_state = self.scheduler.step(
                         image_scheduler_state,
                         noise_pred.astype(jnp.float32),
                         t,
             hint = image_latents
         else:
             hint = self.vae.apply(
+                    {'params': params['vae']},
                     hint,
                     method = self.vae.encode
             ).latent_dist.mean * self.vae.config.scaling_factor
                 rng,
                 shape = latent_shape,
                 dtype = jnp.float32
+        ) * params['scheduler'].init_noise_sigma
+        scheduler_state = self.scheduler.set_timesteps(
+                params['scheduler'],
                 num_inference_steps = inference_steps,
                 shape = latents.shape
         )
             latents, scheduler_state = args
             t = scheduler_state.timesteps[step]#jnp.array(scheduler_state.timesteps, dtype = jnp.int32)[step]
             tt = jnp.broadcast_to(t, latents.shape[0])
+            latents_input = self.scheduler.scale_model_input(scheduler_state, latents, t)
             latents_input = jnp.concatenate([latents_input, mask, hint], axis = 1)
             noise_pred = self.unet.apply(
                     { 'params': params['unet'] },
                     encoded_neg_prompt
             ).sample
             noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
+            latents, scheduler_state = self.scheduler.step(
                     scheduler_state,
                     noise_pred.astype(jnp.float32),
                     t,
                 None,   #  7 height
                 None,   #  8 width
                 None,   #  9 cfg
+                0,      # 10 rng
+                0,      # 11 params
+                None,   # 12 use_imagegen
         ),
         static_broadcasted_argnums = ( # trigger recompilation on change
                 0,      # inference_class
                 6,      # num_frames
                 7,      # height
                 8,      # width
+                12,     # use_imagegen
         )
 )
 def _p_generate(
         neg_tokens,
         hint,
         mask,
+        inference_steps,
+        num_frames,
+        height,
+        width,
+        cfg,
         rng,
         params,
+        use_imagegen
 ):
     return inference_class._generate(
             tokens,
             height,
             width,
             cfg,
             rng,
             params,
+            use_imagegen
     )

requirements.txt CHANGED Viewed

@@ -6,5 +6,5 @@ einops
 -f https://download.pytorch.org/whl/cpu/torch
 torch[cpu]
 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-jax[cuda11_pip] #jax[cuda11_cudnn82] #jax[cuda11_cudnn86] #jax[cuda11_cudnn805]
 flax

 -f https://download.pytorch.org/whl/cpu/torch
 torch[cpu]
 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+jax[cuda11_cudnn82] #jax[cuda11_cudnn86] #jax[cuda11_cudnn805]
 flax