Diffusers documentation
LTX-2
LTX-2
LTX-2 is a DiT-based audio-video foundation model designed to generate synchronized video and audio within a single model. It brings together the core building blocks of modern video generation, with open weights and a focus on practical, local execution.
You can find all the original LTX-2 checkpoints under the Lightricks organization.
The original codebase for LTX-2 can be found here.
Two-Stage Generation
This is the recommended pipeline for achieving production-quality generation. It is composed of two stages:
- Stage 1: Generate a video at the target resolution using diffusion sampling with classifier-free guidance (CFG). This stage produces a coherent low-noise video sequence that respects the text/image conditioning.
- Stage 2: Upsample the Stage 1 output by 2 and refine details using a distilled LoRA model to improve fidelity and visual quality. Stage 2 may apply lighter CFG to preserve the structure from Stage 1 while enhancing texture and sharpness.
Sample usage of the two-stage text-to-video pipeline
import torch
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
device = "cuda:0"
width = 768
height = 512
pipe = LTX2Pipeline.from_pretrained(
"Lightricks/LTX-2", torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)
prompt = "A beautiful sunset over the ocean"
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
# Stage 1 default (non-distilled) inference
frame_rate = 24.0
video_latent, audio_latent = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=40,
sigmas=None,
guidance_scale=4.0,
output_type="latent",
return_dict=False,
)
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
"Lightricks/LTX-2",
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]
# Load Stage 2 distilled LoRA
pipe.load_lora_weights(
"Lightricks/LTX-2", adapter_name="stage_2_distilled", weight_name="ltx-2-19b-distilled-lora-384.safetensors"
)
pipe.set_adapters("stage_2_distilled", 1.0)
# VAE tiling is usually necessary to avoid OOM error when VAE decoding
pipe.vae.enable_tiling()
# Change scheduler to use Stage 2 distilled sigmas as is
new_scheduler = FlowMatchEulerDiscreteScheduler.from_config(
pipe.scheduler.config, use_dynamic_shifting=False, shift_terminal=None
)
pipe.scheduler = new_scheduler
# Stage 2 inference with distilled LoRA and sigmas
video, audio = pipe(
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=3,
noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0], # renoise with first sigma value https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py#L218
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_lora_distilled_sample.mp4",
)
Distilled checkpoint generation
Fastest two-stage generation pipeline using a distilled checkpoint.
import torch
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
pipe = LTX2Pipeline.from_pretrained(
model_path, torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)
prompt = "A beautiful sunset over the ocean"
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
frame_rate = 24.0
video_latent, audio_latent = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=8,
sigmas=DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
generator=generator,
output_type="latent",
return_dict=False,
)
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
model_path,
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]
video, audio = pipe(
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=3,
noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0], # renoise with first sigma value https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/src/ltx_pipelines/distilled.py#L178
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
generator=generator,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_distilled_sample.mp4",
)
Condition Pipeline Generation
You can use LTX2ConditionPipeline to specify image and/or video conditions at arbitrary latent indices. For example, we can specify both a first-frame and last-frame condition to perform first-last-frame-to-video (FLF2V) generation:
import torch
from diffusers import LTX2ConditionPipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image
device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload(device=device)
pipe.vae.enable_tiling()
prompt = (
"CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are "
"delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright "
"sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, "
"low-angle perspective."
)
first_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png",
)
last_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png",
)
first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
conditions = [first_cond, last_cond]
frame_rate = 24.0
video_latent, audio_latent = pipe(
conditions=conditions,
prompt=prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=8,
sigmas=DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
generator=generator,
output_type="latent",
return_dict=False,
)
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
model_path,
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]
video, audio = pipe(
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
width=width * 2,
height=height * 2,
num_inference_steps=3,
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
generator=generator,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_distilled_flf2v.mp4",
)
You can use both image and video conditions:
import torch
from diffusers import LTX2ConditionPipeline
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image, load_video
device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload(device=device)
pipe.vae.enable_tiling()
prompt = (
"The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is "
"divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features "
"dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered "
"clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, "
"with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The "
"landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the "
"solitude and beauty of a winter drive through a mountainous region."
)
negative_prompt = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
"off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)
cond_video = load_video(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
)
cond_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)
video_cond = LTX2VideoCondition(frames=cond_video, index=0, strength=1.0)
image_cond = LTX2VideoCondition(frames=cond_image, index=8, strength=1.0)
conditions = [video_cond, image_cond]
frame_rate = 24.0
video, audio = pipe(
conditions=conditions,
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=40,
guidance_scale=4.0,
generator=generator,
output_type="np",
return_dict=False,
)
encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_cond_video.mp4",
)
Because the conditioning is done via latent frames, the 8 data-space frames corresponding to the specified latent frame for an image condition will tend to be static.
LTX2Pipeline
class diffusers.LTX2Pipeline
< source >( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTX2Video audio_vae: AutoencoderKLLTX2Audio text_encoder: Gemma3ForConditionalGeneration tokenizer: transformers.models.gemma.tokenization_gemma.GemmaTokenizer | transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast connectors: LTX2TextConnectors transformer: LTX2VideoTransformer3DModel vocoder: diffusers.pipelines.ltx2.vocoder.LTX2Vocoder | diffusers.pipelines.ltx2.vocoder.LTX2VocoderWithBWE processor: transformers.models.gemma3.processing_gemma3.Gemma3Processor | None = None )
Parameters
- transformer (LTX2VideoTransformer3DModel) — Conditional Transformer architecture to denoise the encoded video latents.
- scheduler (FlowMatchEulerDiscreteScheduler) —
A scheduler to be used in combination with
transformer to denoise the encoded image latents. - vae (AutoencoderKLLTX2Video) — Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
- text_encoder (
Gemma3ForConditionalGeneration) — The Gemma 3 model used to encode the text prompt. - tokenizer (
GemmaTokenizer or GemmaTokenizerFast) — Tokenizer corresponding to the Gemma text encoder. - connectors (
LTX2TextConnectors) — Text connector stack used to adapt text encoder hidden states for the video and audio branches.
Pipeline for text-to-video generation.
Reference: https://github.com/Lightricks/LTX-2
__call__
< source >( prompt: str | list[str] = None negative_prompt: str | list[str] | None = None height: int = 512 width: int = 768 num_frames: int = 121 frame_rate: float = 24.0 num_inference_steps: int = 40 sigmas: list[float] | None = None timesteps: list = None guidance_scale: float = 4.0 stg_scale: float = 0.0 modality_scale: float = 1.0 guidance_rescale: float = 0.0 audio_guidance_scale: float | None = None audio_stg_scale: float | None = None audio_modality_scale: float | None = None audio_guidance_rescale: float | None = None spatio_temporal_guidance_blocks: list[int] | None = None noise_scale: float = 0.0 num_videos_per_prompt: int = 1 generator: torch._C.Generator | list[torch._C.Generator] | None = None latents: torch.Tensor | None = None audio_latents: torch.Tensor | None = None prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None decode_timestep: float | list[float] = 0.0 decode_noise_scale: float | list[float] | None = None use_cross_timestep: bool = False system_prompt: str | None = None prompt_max_new_tokens: int = 512 prompt_enhancement_kwargs: dict[str, typing.Any] | None = None prompt_enhancement_seed: int = 10 output_type: str = 'pil' return_dict: bool = True attention_kwargs: dict[str, typing.Any] | None = None callback_on_step_end: typing.Optional[typing.Callable[[int, int], NoneType]] = None callback_on_step_end_tensor_inputs: list = ['latents'] max_sequence_length: int = 1024 ) → ~pipelines.ltx.LTX2PipelineOutput or tuple
Parameters
- prompt (
str or list[str], optional) — The prompt or prompts to guide the image generation. If not defined, one has to pass prompt_embeds instead. - height (
int, optional, defaults to 512) — The height in pixels of the generated image. This is set to 512 by default for the best results. - width (
int, optional, defaults to 768) — The width in pixels of the generated image. This is set to 768 by default for the best results. - num_frames (
int, optional, defaults to121) — The number of video frames to generate - frame_rate (
float, optional, defaults to24.0) — The frames per second (FPS) of the generated video. - num_inference_steps (
int, optional, defaults to 40) — The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - sigmas (
List[float], optional) — Custom sigmas to use for the denoising process with schedulers which support asigmasargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. - timesteps (
list[int], optional) — Custom timesteps to use for the denoising process with schedulers which support atimestepsargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. Must be in descending order. - guidance_scale (
float, optional, defaults to4.0) — Guidance scale as defined in Classifier-Free Diffusion Guidance.guidance_scaleis defined aswof equation 2. of Imagen Paper. Guidance scale is enabled by settingguidance_scale > 1. Higher guidance scale encourages to generate images that are closely linked to the textprompt, usually at the expense of lower image quality. Used for the video modality (there is a separate valueaudio_guidance_scalefor the audio modality). - stg_scale (
float, optional, defaults to0.0) — Video guidance scale for Spatio-Temporal Guidance (STG), proposed in Spatiotemporal Skip Guidance for Enhanced Video Diffusion Sampling. STG uses a CFG-like estimate where we move the sample away from a weak sample from a perturbed version of the denoising model. Enabling STG will result in an additional denoising model forward pass; the default value of0.0means that STG is disabled. - modality_scale (
float, optional, defaults to 1.0) — Video guidance scale for LTX-2.X modality isolation guidance, where we move the sample away from a weaker sample generated by the denoising model with cross-modality (audio-to-video and video-to-audio) cross attention disabled using a CFG-like estimate. Enabling modality guidance will result in an additional denoising model forward pass; the default value of 1.0 means that modality guidance is disabled. - guidance_rescale (
float, optional, defaults to 0.0) — Guidance rescale factor proposed by Common Diffusion Noise Schedules and Sample Steps are Flawedguidance_scaleis defined asφin equation 16. of Common Diffusion Noise Schedules and Sample Steps are Flawed. Guidance rescale factor should fix overexposure when using zero terminal SNR. Used for the video modality. - audio_guidance_scale (
float, optional, defaults to None) — Audio guidance scale for CFG with respect to the negative prompt. The CFG update rule is the same for video and audio, but they can use different values for the guidance scale. The LTX-2.X authors suggest that the audio_guidance_scale should be higher relative to the video guidance_scale (e.g. for LTX-2.3 they suggest 3.0 for video and 7.0 for audio). If None, defaults to the video value guidance_scale. - audio_stg_scale (
float, optional, defaults toNone) — Audio guidance scale for STG. As with CFG, the STG update rule is otherwise the same for video and audio. For LTX-2.3, a value of 1.0 is suggested for both video and audio. IfNone, defaults to the video valuestg_scale. - audio_modality_scale (
float, optional, defaults toNone) — Audio guidance scale for LTX-2.X modality isolation guidance. As with CFG, the modality guidance rule is otherwise the same for video and audio. For LTX-2.3, a value of 3.0 is suggested for both video and audio. IfNone, defaults to the video valuemodality_scale. - audio_guidance_rescale (
float, optional, defaults toNone) — A separate guidance rescale factor for the audio modality. IfNone, defaults to the video valueguidance_rescale. - spatio_temporal_guidance_blocks (
list[int], optional, defaults toNone) — The zero-indexed transformer block indices at which to apply STG. Must be supplied if STG is used (stg_scaleoraudio_stg_scaleis greater than0). A value of[29]is recommended for LTX-2.0 and[28]is recommended for LTX-2.3. - noise_scale (
float, optional, defaults to 0.0) — The interpolation factor between random noise and denoised latents at each timestep. Noise is applied to the latents and audio_latents before denoising continues. - num_videos_per_prompt (
int, optional, defaults to 1) — The number of videos to generate per prompt. - generator (
torch.Generatororlist[torch.Generator], optional) — One or a list of torch generator(s) to make generation deterministic. - latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - audio_latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - prompt_attention_mask (
torch.Tensor, optional) — Pre-generated attention mask for text embeddings. - negative_prompt_embeds (
torch.FloatTensor, optional) — Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - negative_prompt_attention_mask (
torch.FloatTensor, optional) — Pre-generated attention mask for negative text embeddings. - decode_timestep (
float, defaults to0.0) — The timestep at which generated video is decoded. - decode_noise_scale (
float, defaults toNone) — The interpolation factor between random noise and denoised latents at the decode timestep. - use_cross_timestep (
bool, optional, defaults to False) — Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when calculating the cross attention modulation parameters. True is the newer (e.g. LTX-2.3) behavior; False is the legacy LTX-2.0 behavior. - system_prompt (
str, optional, defaults toNone) — Optional system prompt to use for prompt enhancement. The system prompt will be used by the current text encoder (by default, aGemma3ForConditionalGenerationmodel) to generate an enhanced prompt from the originalpromptto condition generation. If not supplied, prompt enhancement will not be performed. - prompt_max_new_tokens (
int, optional, defaults to512) — The maximum number of new tokens to generate when performing prompt enhancement. - prompt_enhancement_kwargs (
dict[str, Any], optional, defaults toNone) — Keyword arguments forself.text_encoder.generate. If not supplied, default arguments ofdo_sample=Trueandtemperature=0.7will be used. See https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate for more details. - prompt_enhancement_seed (
int, optional, defaults to 10) — Random seed for any random operations during prompt enhancement. - output_type (
str, optional, defaults to "pil") — The output format of the generated image. Choose between PIL: PIL.Image.Image or np.array. - return_dict (
bool, optional, defaults toTrue) — Whether or not to return a~pipelines.ltx.LTX2PipelineOutputinstead of a plain tuple. - attention_kwargs (
dict, optional) — A kwargs dictionary that if specified is passed along to theAttentionProcessoras defined underself.processorin diffusers.models.attention_processor. - callback_on_step_end (
Callable, optional) — A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments:callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict).callback_kwargswill include a list of all tensors as specified bycallback_on_step_end_tensor_inputs. - callback_on_step_end_tensor_inputs (
List, optional, defaults to["latents"]) — The list of tensor inputs for thecallback_on_step_endfunction. The tensors specified in the list will be passed ascallback_kwargsargument. You will only be able to include variables listed in the._callback_tensor_inputsattribute of your pipeline class. - max_sequence_length (
int, optional, defaults to1024) — Maximum sequence length to use with theprompt.
Returns
~pipelines.ltx.LTX2PipelineOutput or tuple
If return_dict is True, ~pipelines.ltx.LTX2PipelineOutput is returned, otherwise a tuple is
returned where the first element is a list with the generated images.
Function invoked when calling the pipeline for generation.
Examples:
>>> import torch
>>> from diffusers import LTX2Pipeline
>>> from diffusers.pipelines.ltx2.export_utils import encode_video
>>> pipe = LTX2Pipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
>>> pipe.enable_model_cpu_offload()
>>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
>>> frame_rate = 24.0
>>> video, audio = pipe(
... prompt=prompt,
... negative_prompt=negative_prompt,
... width=768,
... height=512,
... num_frames=121,
... frame_rate=frame_rate,
... num_inference_steps=40,
... guidance_scale=4.0,
... output_type="np",
... return_dict=False,
... )
>>> encode_video(
... video[0],
... fps=frame_rate,
... audio=audio[0].float().cpu(),
... audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
... output_path="video.mp4",
... )
encode_prompt
< source >( prompt: str | list[str] negative_prompt: str | list[str] | None = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None max_sequence_length: int = 1024 scale_factor: int = 8 device: torch.device | None = None dtype: torch.dtype | None = None )
Parameters
- prompt (
strorlist[str], optional) — prompt to be encoded - negative_prompt (
strorlist[str], optional) — The prompt or prompts not to guide the image generation. If not defined, one has to passnegative_prompt_embedsinstead. Ignored when not using guidance (i.e., ignored ifguidance_scaleis less than1). - do_classifier_free_guidance (
bool, optional, defaults toTrue) — Whether to use classifier free guidance or not. - num_videos_per_prompt (
int, optional, defaults to 1) — Number of videos that should be generated per prompt. torch device to place the resulting embeddings on - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - negative_prompt_embeds (
torch.Tensor, optional) — Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - device — (
torch.device, optional): torch device - dtype — (
torch.dtype, optional): torch dtype
Encodes the prompt into text encoder hidden states.
enhance_prompt
< source >( prompt: str system_prompt: str max_new_tokens: int = 512 seed: int = 10 generator: torch._C.Generator | None = None generation_kwargs: dict[str, typing.Any] | None = None device: str | torch.device | None = None )
Enhances the supplied prompt by using the current text encoder (by default a
transformers.Gemma3ForConditionalGeneration model) to generate a new prompt from it and a system prompt.
LTX2ImageToVideoPipeline
class diffusers.LTX2ImageToVideoPipeline
< source >( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTX2Video audio_vae: AutoencoderKLLTX2Audio text_encoder: Gemma3ForConditionalGeneration tokenizer: transformers.models.gemma.tokenization_gemma.GemmaTokenizer | transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast connectors: LTX2TextConnectors transformer: LTX2VideoTransformer3DModel vocoder: diffusers.pipelines.ltx2.vocoder.LTX2Vocoder | diffusers.pipelines.ltx2.vocoder.LTX2VocoderWithBWE processor: transformers.models.gemma3.processing_gemma3.Gemma3Processor | None = None )
Pipeline for image-to-video generation.
Reference: https://github.com/Lightricks/LTX-Video
TODO
__call__
< source >( image: PIL.Image.Image | numpy.ndarray | torch.Tensor | list[PIL.Image.Image] | list[numpy.ndarray] | list[torch.Tensor] = None prompt: str | list[str] = None negative_prompt: str | list[str] | None = None height: int = 512 width: int = 768 num_frames: int = 121 frame_rate: float = 24.0 num_inference_steps: int = 40 sigmas: list[float] | None = None timesteps: list[int] | None = None guidance_scale: float = 4.0 stg_scale: float = 0.0 modality_scale: float = 1.0 guidance_rescale: float = 0.0 audio_guidance_scale: float | None = None audio_stg_scale: float | None = None audio_modality_scale: float | None = None audio_guidance_rescale: float | None = None spatio_temporal_guidance_blocks: list[int] | None = None noise_scale: float = 0.0 num_videos_per_prompt: int = 1 generator: torch._C.Generator | list[torch._C.Generator] | None = None latents: torch.Tensor | None = None audio_latents: torch.Tensor | None = None prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None decode_timestep: float | list[float] = 0.0 decode_noise_scale: float | list[float] | None = None use_cross_timestep: bool = False system_prompt: str | None = None prompt_max_new_tokens: int = 512 prompt_enhancement_kwargs: dict[str, typing.Any] | None = None prompt_enhancement_seed: int = 10 output_type: str = 'pil' return_dict: bool = True attention_kwargs: dict[str, typing.Any] | None = None callback_on_step_end: typing.Optional[typing.Callable[[int, int], NoneType]] = None callback_on_step_end_tensor_inputs: list = ['latents'] max_sequence_length: int = 1024 ) → ~pipelines.ltx.LTX2PipelineOutput or tuple
Parameters
- image (
PipelineImageInput) — The input image to condition the generation on. Must be an image, a list of images or atorch.Tensor. - prompt (
strorlist[str], optional) — The prompt or prompts to guide the image generation. If not defined, one has to passprompt_embeds. instead. - height (
int, optional, defaults to512) — The height in pixels of the generated image. This is set to 512 by default for the best results. - width (
int, optional, defaults to768) — The width in pixels of the generated image. This is set to 768 by default for the best results. - num_frames (
int, optional, defaults to121) — The number of video frames to generate - frame_rate (
float, optional, defaults to24.0) — The frames per second (FPS) of the generated video. - num_inference_steps (
int, optional, defaults to 40) — The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - sigmas (
List[float], optional) — Custom sigmas to use for the denoising process with schedulers which support asigmasargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. - timesteps (
List[int], optional) — Custom timesteps to use for the denoising process with schedulers which support atimestepsargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. Must be in descending order. - guidance_scale (
float, optional, defaults to4.0) — Guidance scale as defined in Classifier-Free Diffusion Guidance.guidance_scaleis defined aswof equation 2. of Imagen Paper. Guidance scale is enabled by settingguidance_scale > 1. Higher guidance scale encourages to generate images that are closely linked to the textprompt, usually at the expense of lower image quality. Used for the video modality (there is a separate valueaudio_guidance_scalefor the audio modality). - stg_scale (
float, optional, defaults to0.0) — Video guidance scale for Spatio-Temporal Guidance (STG), proposed in Spatiotemporal Skip Guidance for Enhanced Video Diffusion Sampling. STG uses a CFG-like estimate where we move the sample away from a weak sample from a perturbed version of the denoising model. Enabling STG will result in an additional denoising model forward pass; the default value of0.0means that STG is disabled. - modality_scale (
float, optional, defaults to1.0) — Video guidance scale for LTX-2.X modality isolation guidance, where we move the sample away from a weaker sample generated by the denoising model with cross-modality (audio-to-video and video-to-audio) cross attention disabled using a CFG-like estimate. Enabling modality guidance will result in an additional denoising model forward pass; the default value of1.0means that modality guidance is disabled. - guidance_rescale (
float, optional, defaults to 0.0) — Guidance rescale factor proposed by Common Diffusion Noise Schedules and Sample Steps are Flawedguidance_scaleis defined asφin equation 16. of Common Diffusion Noise Schedules and Sample Steps are Flawed. Guidance rescale factor should fix overexposure when using zero terminal SNR. Used for the video modality. - audio_guidance_scale (
float, optional defaults toNone) — Audio guidance scale for CFG with respect to the negative prompt. The CFG update rule is the same for video and audio, but they can use different values for the guidance scale. The LTX-2.X authors suggest that theaudio_guidance_scaleshould be higher relative to the videoguidance_scale(e.g. for LTX-2.3 they suggest 3.0 for video and 7.0 for audio). IfNone, defaults to the video valueguidance_scale. - audio_stg_scale (
float, optional, defaults toNone) — Audio guidance scale for STG. As with CFG, the STG update rule is otherwise the same for video and audio. For LTX-2.3, a value of 1.0 is suggested for both video and audio. IfNone, defaults to the video valuestg_scale. - audio_modality_scale (
float, optional, defaults toNone) — Audio guidance scale for LTX-2.X modality isolation guidance. As with CFG, the modality guidance rule is otherwise the same for video and audio. For LTX-2.3, a value of 3.0 is suggested for both video and audio. IfNone, defaults to the video valuemodality_scale. - audio_guidance_rescale (
float, optional, defaults toNone) — A separate guidance rescale factor for the audio modality. IfNone, defaults to the video valueguidance_rescale. - spatio_temporal_guidance_blocks (
list[int], optional, defaults toNone) — The zero-indexed transformer block indices at which to apply STG. Must be supplied if STG is used (stg_scaleoraudio_stg_scaleis greater than0). A value of[29]is recommended for LTX-2.0 and[28]is recommended for LTX-2.3. - noise_scale (
float, optional, defaults to0.0) — The interpolation factor between random noise and denoised latents at each timestep. Applies noise to thelatentsandaudio_latentsbefore continuing denoising. - num_videos_per_prompt (
int, optional, defaults to 1) — The number of videos to generate per prompt. - generator (
torch.Generatororlist[torch.Generator], optional) — One or a list of torch generator(s) to make generation deterministic. - latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - audio_latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - prompt_attention_mask (
torch.Tensor, optional) — Pre-generated attention mask for text embeddings. - negative_prompt_embeds (
torch.FloatTensor, optional) — Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - negative_prompt_attention_mask (
torch.FloatTensor, optional) — Pre-generated attention mask for negative text embeddings. - decode_timestep (
float, defaults to0.0) — The timestep at which generated video is decoded. - decode_noise_scale (
float, defaults toNone) — The interpolation factor between random noise and denoised latents at the decode timestep. - use_cross_timestep (
booloptional, defaults toFalse) — Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when calculating the cross attention modulation parameters.Trueis the newer (e.g. LTX-2.3) behavior;Falseis the legacy LTX-2.0 behavior. - system_prompt (
str, optional, defaults toNone) — Optional system prompt to use for prompt enhancement. The system prompt will be used by the current text encoder (by default, aGemma3ForConditionalGenerationmodel) to generate an enhanced prompt from the originalpromptto condition generation. If not supplied, prompt enhancement will not be performed. - prompt_max_new_tokens (
int, optional, defaults to512) — The maximum number of new tokens to generate when performing prompt enhancement. - prompt_enhancement_kwargs (
dict[str, Any], optional, defaults toNone) — Keyword arguments forself.text_encoder.generate. If not supplied, default arguments ofdo_sample=Trueandtemperature=0.7will be used. See https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate for more details. - prompt_enhancement_seed (
int, optional, default to10) — Random seed for any random operations during prompt enhancement. - output_type (
str, optional, defaults to"pil") — The output format of the generated image. Choose between PIL:PIL.Image.Imageornp.array. - return_dict (
bool, optional, defaults toTrue) — Whether or not to return a~pipelines.ltx.LTX2PipelineOutputinstead of a plain tuple. - attention_kwargs (
dict, optional) — A kwargs dictionary that if specified is passed along to theAttentionProcessoras defined underself.processorin diffusers.models.attention_processor. - callback_on_step_end (
Callable, optional) — A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments:callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict).callback_kwargswill include a list of all tensors as specified bycallback_on_step_end_tensor_inputs. - callback_on_step_end_tensor_inputs (
List, optional) — The list of tensor inputs for thecallback_on_step_endfunction. The tensors specified in the list will be passed ascallback_kwargsargument. You will only be able to include variables listed in the._callback_tensor_inputsattribute of your pipeline class. - max_sequence_length (
int, optional, defaults to1024) — Maximum sequence length to use with theprompt.
Returns
~pipelines.ltx.LTX2PipelineOutput or tuple
If return_dict is True, ~pipelines.ltx.LTX2PipelineOutput is returned, otherwise a tuple is
returned where the first element is a list with the generated images.
Function invoked when calling the pipeline for generation.
Examples:
>>> import torch
>>> from diffusers import LTX2ImageToVideoPipeline
>>> from diffusers.pipelines.ltx2.export_utils import encode_video
>>> from diffusers.utils import load_image
>>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
>>> pipe.enable_model_cpu_offload()
>>> image = load_image(
... "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
... )
>>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
>>> frame_rate = 24.0
>>> video, audio = pipe(
... image=image,
... prompt=prompt,
... negative_prompt=negative_prompt,
... width=768,
... height=512,
... num_frames=121,
... frame_rate=frame_rate,
... num_inference_steps=40,
... guidance_scale=4.0,
... output_type="np",
... return_dict=False,
... )
>>> encode_video(
... video[0],
... fps=frame_rate,
... audio=audio[0].float().cpu(),
... audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
... output_path="video.mp4",
... )encode_prompt
< source >( prompt: str | list[str] negative_prompt: str | list[str] | None = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None max_sequence_length: int = 1024 scale_factor: int = 8 device: torch.device | None = None dtype: torch.dtype | None = None )
Parameters
- prompt (
strorlist[str], optional) — prompt to be encoded - negative_prompt (
strorlist[str], optional) — The prompt or prompts not to guide the image generation. If not defined, one has to passnegative_prompt_embedsinstead. Ignored when not using guidance (i.e., ignored ifguidance_scaleis less than1). - do_classifier_free_guidance (
bool, optional, defaults toTrue) — Whether to use classifier free guidance or not. - num_videos_per_prompt (
int, optional, defaults to 1) — Number of videos that should be generated per prompt. - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - negative_prompt_embeds (
torch.Tensor, optional) — Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - device — (
torch.device, optional): torch device - dtype — (
torch.dtype, optional): torch dtype
Encodes the prompt into text encoder hidden states.
enhance_prompt
< source >( image: PIL.Image.Image | numpy.ndarray | torch.Tensor | list[PIL.Image.Image] | list[numpy.ndarray] | list[torch.Tensor] prompt: str system_prompt: str max_new_tokens: int = 512 seed: int = 10 generator: torch._C.Generator | None = None generation_kwargs: dict[str, typing.Any] | None = None device: str | torch.device | None = None )
Enhances the supplied prompt by generating a new prompt using the current text encoder (default is a
transformers.Gemma3ForConditionalGeneration model) from it and a system prompt.
LTX2ConditionPipeline
class diffusers.LTX2ConditionPipeline
< source >( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTX2Video audio_vae: AutoencoderKLLTX2Audio text_encoder: Gemma3ForConditionalGeneration tokenizer: transformers.models.gemma.tokenization_gemma.GemmaTokenizer | transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast connectors: LTX2TextConnectors transformer: LTX2VideoTransformer3DModel vocoder: diffusers.pipelines.ltx2.vocoder.LTX2Vocoder | diffusers.pipelines.ltx2.vocoder.LTX2VocoderWithBWE )
Pipeline for video generation which allows image conditions to be inserted at arbitrary parts of the video.
Reference: https://github.com/Lightricks/LTX-Video
TODO
__call__
< source >( conditions: diffusers.pipelines.ltx2.pipeline_ltx2_condition.LTX2VideoCondition | list[diffusers.pipelines.ltx2.pipeline_ltx2_condition.LTX2VideoCondition] | None = None prompt: str | list[str] = None negative_prompt: str | list[str] | None = None height: int = 512 width: int = 768 num_frames: int = 121 frame_rate: float = 24.0 num_inference_steps: int = 40 sigmas: list[float] | None = None timesteps: list[float] | None = None guidance_scale: float = 4.0 stg_scale: float = 0.0 modality_scale: float = 1.0 guidance_rescale: float = 0.0 audio_guidance_scale: float | None = None audio_stg_scale: float | None = None audio_modality_scale: float | None = None audio_guidance_rescale: float | None = None spatio_temporal_guidance_blocks: list[int] | None = None noise_scale: float | None = None num_videos_per_prompt: int | None = 1 generator: torch._C.Generator | list[torch._C.Generator] | None = None latents: torch.Tensor | None = None audio_latents: torch.Tensor | None = None prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None decode_timestep: float | list[float] = 0.0 decode_noise_scale: float | list[float] | None = None use_cross_timestep: bool = False output_type: str = 'pil' return_dict: bool = True attention_kwargs: dict[str, typing.Any] | None = None callback_on_step_end: typing.Optional[typing.Callable[[int, int], NoneType]] = None callback_on_step_end_tensor_inputs: list = ['latents'] max_sequence_length: int = 1024 ) → ~pipelines.ltx.LTX2PipelineOutput or tuple
Parameters
- conditions (
List[LTX2VideoCondition], optional) — The list of frame-conditioning items for the video generation. - prompt (
strorList[str], optional) — The prompt or prompts to guide the image generation. If not defined, one has to passprompt_embeds. instead. - height (
int, optional, defaults to512) — The height in pixels of the generated image. This is set to 512 by default for the best results. - width (
int, optional, defaults to768) — The width in pixels of the generated image. This is set to 768 by default for the best results. - num_frames (
int, optional, defaults to121) — The number of video frames to generate - frame_rate (
float, optional, defaults to24.0) — The frames per second (FPS) of the generated video. - num_inference_steps (
int, optional, defaults to 40) — The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - sigmas (
List[float], optional) — Custom sigmas to use for the denoising process with schedulers which support asigmasargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. - timesteps (
List[int], optional) — Custom timesteps to use for the denoising process with schedulers which support atimestepsargument in theirset_timestepsmethod. If not defined, the default behavior whennum_inference_stepsis passed will be used. Must be in descending order. - guidance_scale (
float, optional, defaults to4.0) — Guidance scale as defined in Classifier-Free Diffusion Guidance.guidance_scaleis defined aswof equation 2. of Imagen Paper. Guidance scale is enabled by settingguidance_scale > 1. Higher guidance scale encourages to generate images that are closely linked to the textprompt, usually at the expense of lower image quality. Used for the video modality (there is a separate valueaudio_guidance_scalefor the audio modality). - stg_scale (
float, optional, defaults to0.0) — Video guidance scale for Spatio-Temporal Guidance (STG), proposed in Spatiotemporal Skip Guidance for Enhanced Video Diffusion Sampling. STG uses a CFG-like estimate where we move the sample away from a weak sample from a perturbed version of the denoising model. Enabling STG will result in an additional denoising model forward pass; the default value of0.0means that STG is disabled. - modality_scale (
float, optional, defaults to1.0) — Video guidance scale for LTX-2.X modality isolation guidance, where we move the sample away from a weaker sample generated by the denoising model with cross-modality (audio-to-video and video-to-audio) cross attention disabled using a CFG-like estimate. Enabling modality guidance will result in an additional denoising model forward pass; the default value of1.0means that modality guidance is disabled. - guidance_rescale (
float, optional, defaults to 0.0) — Guidance rescale factor proposed by Common Diffusion Noise Schedules and Sample Steps are Flawedguidance_scaleis defined asφin equation 16. of Common Diffusion Noise Schedules and Sample Steps are Flawed. Guidance rescale factor should fix overexposure when using zero terminal SNR. Used for the video modality. - audio_guidance_scale (
float, optional defaults toNone) — Audio guidance scale for CFG with respect to the negative prompt. The CFG update rule is the same for video and audio, but they can use different values for the guidance scale. The LTX-2.X authors suggest that theaudio_guidance_scaleshould be higher relative to the videoguidance_scale(e.g. for LTX-2.3 they suggest 3.0 for video and 7.0 for audio). IfNone, defaults to the video valueguidance_scale. - audio_stg_scale (
float, optional, defaults toNone) — Audio guidance scale for STG. As with CFG, the STG update rule is otherwise the same for video and audio. For LTX-2.3, a value of 1.0 is suggested for both video and audio. IfNone, defaults to the video valuestg_scale. - audio_modality_scale (
float, optional, defaults toNone) — Audio guidance scale for LTX-2.X modality isolation guidance. As with CFG, the modality guidance rule is otherwise the same for video and audio. For LTX-2.3, a value of 3.0 is suggested for both video and audio. IfNone, defaults to the video valuemodality_scale. - audio_guidance_rescale (
float, optional, defaults toNone) — A separate guidance rescale factor for the audio modality. IfNone, defaults to the video valueguidance_rescale. - spatio_temporal_guidance_blocks (
list[int], optional, defaults toNone) — The zero-indexed transformer block indices at which to apply STG. Must be supplied if STG is used (stg_scaleoraudio_stg_scaleis greater than0). A value of[29]is recommended for LTX-2.0 and[28]is recommended for LTX-2.3. - noise_scale (
float, optional, defaults toNone) — The interpolation factor between random noise and denoised latents at each timestep. Applies noise to thelatentsandaudio_latentsbefore continuing denoising. If not set, will be inferred from the sigma schedule. - num_videos_per_prompt (
int, optional, defaults to 1) — The number of videos to generate per prompt. - generator (
torch.GeneratororList[torch.Generator], optional) — One or a list of torch generator(s) to make generation deterministic. - latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - audio_latents (
torch.Tensor, optional) — Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied randomgenerator. - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - prompt_attention_mask (
torch.Tensor, optional) — Pre-generated attention mask for text embeddings. - negative_prompt_embeds (
torch.FloatTensor, optional) — Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - negative_prompt_attention_mask (
torch.FloatTensor, optional) — Pre-generated attention mask for negative text embeddings. - decode_timestep (
float, defaults to0.0) — The timestep at which generated video is decoded. - decode_noise_scale (
float, defaults toNone) — The interpolation factor between random noise and denoised latents at the decode timestep. - use_cross_timestep (
booloptional, defaults toFalse) — Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when calculating the cross attention modulation parameters.Trueis the newer (e.g. LTX-2.3) behavior;Falseis the legacy LTX-2.0 behavior. - output_type (
str, optional, defaults to"pil") — The output format of the generated image. Choose between PIL:PIL.Image.Imageornp.array. - return_dict (
bool, optional, defaults toTrue) — Whether or not to return a~pipelines.ltx.LTX2PipelineOutputinstead of a plain tuple. - attention_kwargs (
dict, optional) — A kwargs dictionary that if specified is passed along to theAttentionProcessoras defined underself.processorin diffusers.models.attention_processor. - callback_on_step_end (
Callable, optional) — A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments:callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict).callback_kwargswill include a list of all tensors as specified bycallback_on_step_end_tensor_inputs. - callback_on_step_end_tensor_inputs (
List, optional) — The list of tensor inputs for thecallback_on_step_endfunction. The tensors specified in the list will be passed ascallback_kwargsargument. You will only be able to include variables listed in the._callback_tensor_inputsattribute of your pipeline class. - max_sequence_length (
int, optional, defaults to1024) — Maximum sequence length to use with theprompt.
Returns
~pipelines.ltx.LTX2PipelineOutput or tuple
If return_dict is True, ~pipelines.ltx.LTX2PipelineOutput is returned, otherwise a tuple is
returned where the first element is a list with the generated images.
Function invoked when calling the pipeline for generation.
Examples:
>>> import torch
>>> from diffusers import LTX2ConditionPipeline
>>> from diffusers.pipelines.ltx2.export_utils import encode_video
>>> from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
>>> from diffusers.utils import load_image
>>> pipe = LTX2ConditionPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
>>> pipe.enable_model_cpu_offload()
>>> first_image = load_image(
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
... )
>>> last_image = load_image(
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
... )
>>> first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
>>> last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
>>> conditions = [first_cond, last_cond]
>>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings."
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, static"
>>> frame_rate = 24.0
>>> video, audio = pipe(
... conditions=conditions,
... prompt=prompt,
... negative_prompt=negative_prompt,
... width=768,
... height=512,
... num_frames=121,
... frame_rate=frame_rate,
... num_inference_steps=40,
... guidance_scale=4.0,
... output_type="np",
... return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)
>>> encode_video(
... video[0],
... fps=frame_rate,
... audio=audio[0].float().cpu(),
... audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
... output_path="video.mp4",
... )apply_visual_conditioning
< source >( latents: Tensor conditioning_mask: Tensor condition_latents: list condition_strengths: list condition_indices: list latent_height: int latent_width: int ) → Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
Parameters
- latents (
torch.Tensor) — Initial packed (patchified) latents of shape [batch_size, patch_seq_len, hidden_dim]. - conditioning_mask (
torch.Tensor, optional) — Initial packed (patchified) conditioning mask of shape [batch_size, patch_seq_len, 1] with values in [0, 1] where 0 means that the denoising model output will be fully used and 1 means that the condition will be fully used (with intermediate values specifying a blend of the denoised and latent values).
Returns
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
Returns a 3-tuple of tensors where:
- The first element is the packed video latents (with unchanged shape [batch_size, patch_seq_len, hidden_dim]) with the conditions applied
- The second element is the packed conditioning mask with conditioning strengths applied
- The third element holds the clean conditioning latents.
Applies visual conditioning frames to an initial latent.
encode_prompt
< source >( prompt: str | list[str] negative_prompt: str | list[str] | None = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: torch.Tensor | None = None negative_prompt_embeds: torch.Tensor | None = None prompt_attention_mask: torch.Tensor | None = None negative_prompt_attention_mask: torch.Tensor | None = None max_sequence_length: int = 1024 scale_factor: int = 8 device: torch.device | None = None dtype: torch.dtype | None = None )
Parameters
- prompt (
strorlist[str], optional) — prompt to be encoded - negative_prompt (
strorlist[str], optional) — The prompt or prompts not to guide the image generation. If not defined, one has to passnegative_prompt_embedsinstead. Ignored when not using guidance (i.e., ignored ifguidance_scaleis less than1). - do_classifier_free_guidance (
bool, optional, defaults toTrue) — Whether to use classifier free guidance or not. - num_videos_per_prompt (
int, optional, defaults to 1) — Number of videos that should be generated per prompt. - prompt_embeds (
torch.Tensor, optional) — Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated frompromptinput argument. - negative_prompt_embeds (
torch.Tensor, optional) — Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, negative_prompt_embeds will be generated fromnegative_promptinput argument. - device — (
torch.device, optional): torch device - dtype — (
torch.dtype, optional): torch dtype
Encodes the prompt into text encoder hidden states.
preprocess_conditions
< source >( conditions: diffusers.pipelines.ltx2.pipeline_ltx2_condition.LTX2VideoCondition | list[diffusers.pipelines.ltx2.pipeline_ltx2_condition.LTX2VideoCondition] | None = None height: int = 512 width: int = 768 num_frames: int = 121 device: torch.device | None = None ) → Tuple[List[torch.Tensor], List[float], List[int]]
Parameters
- conditions (
LTX2VideoConditionorList[LTX2VideoCondition], optional, defaults toNone) — A list of image/video condition instances. - height (
int, optional, defaults to512) — The desired height in pixels. - width (
int, optional, defaults to768) — The desired width in pixels. - num_frames (
int, optional, defaults to121) — The desired number of frames in the generated video. - device (
torch.device, optional, defaults toNone) — The device on which to put the preprocessed image/video tensors.
Returns
Tuple[List[torch.Tensor], List[float], List[int]]
Returns a 3-tuple of lists of length len(conditions) as follows:
- The first list is a list of preprocessed video tensors of shape [batch_size=1, num_channels, num_frames, height, width].
- The second list is a list of conditioning strengths.
- The third list is a list of indices in latent space to insert the corresponding condition.
Preprocesses the condition images/videos to torch tensors.
trim_conditioning_sequence
< source >( start_frame: int sequence_num_frames: int target_num_frames: int ) → int
Trim a conditioning sequence to the allowed number of frames.
LTX2LatentUpsamplePipeline
class diffusers.LTX2LatentUpsamplePipeline
< source >( vae: AutoencoderKLLTX2Video latent_upsampler: LTX2LatentUpsamplerModel )
__call__
< source >( video: list[PIL.Image.Image | numpy.ndarray | torch.Tensor | list[PIL.Image.Image] | list[numpy.ndarray] | list[torch.Tensor]] | None = None height: int = 512 width: int = 768 num_frames: int = 121 spatial_patch_size: int = 1 temporal_patch_size: int = 1 latents: torch.Tensor | None = None latents_normalized: bool = False decode_timestep: float | list[float] = 0.0 decode_noise_scale: float | list[float] | None = None adain_factor: float = 0.0 tone_map_compression_ratio: float = 0.0 generator: torch._C.Generator | list[torch._C.Generator] | None = None output_type: str | None = 'pil' return_dict: bool = True ) → ~pipelines.ltx.LTXPipelineOutput or tuple
Parameters
- video (
`list[PipelineImageInput]`, optional) — The video to be upsampled (such as a LTX 2.0 first stage output). If not supplied, `latents` should be supplied. - height (
`int`, optional, defaults to `512`) — The height in pixels of the input video (not the generated video, which will have a larger resolution). - width (
`int`, optional, defaults to `768`) — The width in pixels of the input video (not the generated video, which will have a larger resolution). - num_frames (
`int`, optional, defaults to `121`) — The number of frames in the input video. - spatial_patch_size (
`int`, optional, defaults to `1`) — The spatial patch size of the video latents. Used when `latents` is supplied, if unpacking is necessary. - temporal_patch_size (
`int`, optional, defaults to `1`) — The temporal patch size of the video latents. Used when `latents` is supplied, if unpacking is necessary. - latents (
`torch.Tensor`, optional) — Pre-generated video latents. This can be supplied in place of the `video` argument. Can either be a patch sequence of shape `(batch_size, seq_len, hidden_dim)` or a video latent of shape `(batch_size, latent_channels, latent_frames, latent_height, latent_width)`. - latents_normalized (
`bool`, optional, defaults to `False`) — If `latents` are supplied, whether the `latents` are normalized using the VAE latent mean and std. If `True`, the `latents` will be denormalized before being supplied to the latent upsampler. - decode_timestep (
`float`, defaults to `0.0`) — The timestep at which the generated video is decoded. - decode_noise_scale (
`float`, defaults to `None`) — The interpolation factor between random noise and denoised latents at the decode timestep. - adain_factor (
`float`, optional, defaults to `0.0`) — Adaptive Instance Normalization (AdaIN) blending factor between the upsampled and original latents. Should be in [-10.0, 10.0]; supplying 0.0 (the default) means that AdaIN is not performed. - tone_map_compression_ratio (
`float`, optional, defaults to `0.0`) — The compression strength for tone mapping, which will reduce the dynamic range of the latent values. This is useful for regularizing high-variance latents or for conditioning outputs during generation. Should be in [0, 1], where 0.0 (the default) means tone mapping is not applied and 1.0 corresponds to the full compression effect. - generator (
`torch.Generator` or `list[torch.Generator]`, optional) — One or a list of torch generator(s) to make generation deterministic. - output_type (
`str`, optional, defaults to `"pil"`) — The output format of the generated video. Choose between PIL: `PIL.Image.Image` or `np.array`. - return_dict (
`bool`, optional, defaults to `True`) — Whether or not to return a `~pipelines.ltx.LTXPipelineOutput` instead of a plain tuple.
Returns
~pipelines.ltx.LTXPipelineOutput or tuple
If return_dict is True, ~pipelines.ltx.LTXPipelineOutput is returned, otherwise a tuple is
returned where the first element is the upsampled video.
Function invoked when calling the pipeline for generation.
Examples:
>>> import torch
>>> from diffusers import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline
>>> from diffusers.pipelines.ltx2.export_utils import encode_video
>>> from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
>>> from diffusers.utils import load_image
>>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
>>> pipe.enable_model_cpu_offload()
>>> image = load_image(
... "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
... )
>>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
>>> frame_rate = 24.0
>>> video, audio = pipe(
... image=image,
... prompt=prompt,
... negative_prompt=negative_prompt,
... width=768,
... height=512,
... num_frames=121,
... frame_rate=frame_rate,
... num_inference_steps=40,
... guidance_scale=4.0,
... output_type="pil",
... return_dict=False,
... )
>>> latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
... "Lightricks/LTX-2", subfolder="latent_upsampler", torch_dtype=torch.bfloat16
... )
>>> upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
>>> upsample_pipe.vae.enable_tiling()
>>> upsample_pipe.to(device="cuda", dtype=torch.bfloat16)
>>> video = upsample_pipe(
... video=video,
... width=768,
... height=512,
... output_type="np",
... return_dict=False,
... )[0]
>>> encode_video(
... video[0],
... fps=frame_rate,
... audio=audio[0].float().cpu(),
... audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
... output_path="video.mp4",
... )
adain_filter_latent
< source >( latents: Tensor reference_latents: Tensor factor: float = 1.0 ) → torch.Tensor
Parameters
- latents (
`torch.Tensor`) — Input latents to normalize. - reference_latents (
`torch.Tensor`) — The reference latents providing style statistics. - factor (
`float`) — Blending factor between the original and transformed latents. Range: -10.0 to 10.0. Default: 1.0.
Returns
torch.Tensor
The transformed latent tensor
Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent tensor.
tone_map_latents
< source >( latents: Tensor compression: float )
Applies a non-linear tone-mapping function to latent values to reduce their dynamic range in a perceptually smooth way using a sigmoid-based compression.
This is useful for regularizing high-variance latents or for conditioning outputs during generation, especially
when controlling dynamic behavior with a compression factor.
LTX2PipelineOutput
class diffusers.pipelines.ltx2.pipeline_output.LTX2PipelineOutput
< source >( frames: Tensor audio: Tensor )
Parameters
- frames (
`torch.Tensor`, `np.ndarray`, or `list[list[PIL.Image.Image]]`) — List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)`. - audio (
`torch.Tensor`, `np.ndarray`) — The generated audio output, synchronized with the generated video frames.
Output class for LTX pipelines.