Commit d35cde0
Parent(s): ef15707

Update handler.py

handler.py CHANGED (+29 -16)
@@ -28,12 +28,18 @@ class EndpointHandler:
     MAX_HEIGHT = 720
     MAX_FRAMES = 257
 
+    ENABLE_CPU_OFFLOAD = True
+    EXPERIMENTAL_STUFF = False
+
     def __init__(self, path: str = ""):
         """Initialize the LTX Video handler with both text-to-video and image-to-video pipelines.
 
         Args:
            path (str): Path to the model weights directory
         """
+        if EXPERIMENTAL_STUFF:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
         # Load both pipelines with bfloat16 precision as recommended in docs
         self.text_to_video = LTXPipeline.from_pretrained(
             path,
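The two new toggles, ENABLE_CPU_OFFLOAD and EXPERIMENTAL_STUFF, gate behaviour added further down. The diff references them as bare names inside __init__; since the original indentation is not recoverable here they are shown above as class attributes, in which case the bare references would need self. qualification or module-level definitions to resolve. The experimental branch only flips PyTorch's TF32 switch for CUDA matmuls, as in this minimal sketch (the flag name comes from the diff; everything else is illustrative):

```python
import torch

EXPERIMENTAL_STUFF = False  # same toggle as in the diff, shown here at module level

if EXPERIMENTAL_STUFF:
    # Allow TensorFloat-32 for float32 CUDA matmuls on Ampere-or-newer GPUs:
    # faster matrix multiplies at slightly reduced precision.
    torch.backends.cuda.matmul.allow_tf32 = True
```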
@@ -45,9 +51,9 @@ class EndpointHandler:
             torch_dtype=torch.bfloat16
         ).to("cuda")
 
-
-
-
+        if ENABLE_CPU_OFFLOAD:
+            self.text_to_video.enable_model_cpu_offload()
+            self.image_to_video.enable_model_cpu_offload()
 
     def _validate_and_adjust_resolution(self, width: int, height: int) -> Tuple[int, int]:
         """Validate and adjust resolution to meet constraints.
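enable_model_cpu_offload() is the diffusers API now applied to both pipelines: sub-models stay on the CPU and are moved to the GPU only while they run, which lowers peak VRAM at some latency cost. The diffusers docs recommend calling it instead of an explicit .to("cuda"), which the handler still performs beforehand. A standalone sketch of the same pattern, assuming the diffusers LTXPipeline and a model id that is an assumption, not taken from this commit:

```python
import torch
from diffusers import LTXPipeline

# "Lightricks/LTX-Video" is an assumed model id for illustration only.
pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)

# Instead of pipe.to("cuda"): sub-models are shuttled to the GPU on demand.
pipe.enable_model_cpu_offload()

frames = pipe(prompt="a coastal town at sunset", num_frames=65).frames[0]
```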
@@ -158,28 +164,30 @@ class EndpointHandler:
 
         Returns:
             Dict[str, Any]: Dictionary containing:
-                - video:
-                - content-type: MIME type of the video
+                - video: video encoded in Base64 (h.264 MP4 video). This is a data-uri (prefixed with "data:").
+                - content-type: MIME type of the video (right now always "video/mp4")
                 - metadata: Dictionary with actual values used for generation
         """
-        #
-        prompt = data.
+        # Get inputs from request data
+        prompt = data.pop("inputs", None)
         if not prompt:
-            raise ValueError("
+            raise ValueError("No prompt provided in the 'inputs' field")
 
         # Get and validate resolution
-        width = data.
-        height = data.
+        width = data.pop("width", self.DEFAULT_WIDTH)
+        height = data.pop("height", self.DEFAULT_HEIGHT)
         width, height = self._validate_and_adjust_resolution(width, height)
 
         # Get and validate frames and FPS
-        num_frames = data.
-        fps = data.
+        num_frames = data.pop("num_frames", self.DEFAULT_NUM_FRAMES)
+        fps = data.pop("fps", self.DEFAULT_FPS)
         num_frames, fps = self._validate_and_adjust_frames(num_frames, fps)
 
         # Get other parameters with defaults
-        guidance_scale = data.
-        num_inference_steps = data.
+        guidance_scale = data.pop("guidance_scale", 7.5)
+        num_inference_steps = data.pop("num_inference_steps", self.DEFAULT_NUM_STEPS)
+        seed = data.pop("seed", -1)
+        seed = None if seed == -1 else int(seed)
 
         logger.info(f"Generating video with prompt: '{prompt}'")
         logger.info(f"Parameters: size={width}x{height}, num_frames={num_frames}, fps={fps}")
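With this change every request parameter is read via data.pop(...) with a fallback to the class defaults, and a seed of -1 is translated to None (no fixed seed). A hypothetical client payload exercising these fields; only the JSON key names come from the diff, while the URL, token and values are placeholders:

```python
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HEADERS = {"Authorization": "Bearer <token>"}                         # placeholder

payload = {
    "inputs": "a red panda walking through snowy woods",  # required prompt
    "width": 704,                # adjusted by _validate_and_adjust_resolution
    "height": 480,
    "num_frames": 121,           # adjusted together with fps by _validate_and_adjust_frames
    "fps": 24,
    "guidance_scale": 7.5,
    "num_inference_steps": 40,   # example value; the handler supplies its own default
    "seed": -1,                  # -1 is mapped to None (no fixed seed)
}

response = requests.post(ENDPOINT_URL, headers=HEADERS, json=payload)
result = response.json()
```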
@@ -216,9 +224,14 @@ class EndpointHandler:
         # Encode video to base64
         video_base64 = base64.b64encode(video_content).decode('utf-8')
 
+        content_type = "video/mp4"
+
+        # Add MP4 data URI prefix
+        video_data_uri = f"data:{content_type};base64,{video_base64}"
+
         return {
-            "video":
-            "content-type":
+            "video": video_data_uri,
+            "content-type": content_type,
             "metadata": {
                 "width": width,
                 "height": height,
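The response now wraps the Base64-encoded MP4 in a data URI and reports the MIME type separately, so a client has to strip the "data:" prefix before decoding. A small sketch, assuming result is the parsed JSON from the request example above:

```python
import base64

# "video" looks like "data:video/mp4;base64,AAAA..."; split off the prefix.
header, b64_payload = result["video"].split(",", 1)
assert header == f"data:{result['content-type']};base64"

with open("generated.mp4", "wb") as f:
    f.write(base64.b64decode(b64_payload))

print(result["metadata"])  # the actual values used for generation
```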
|
|