jbilcke-hf (HF staff) committed f6dd4f3
Parent: 85f39ae

Update handler.py

Files changed (1): handler.py (+73 -84)

handler.py CHANGED
@@ -70,6 +70,14 @@ class EndpointHandler:
         self.text_to_video.enable_model_cpu_offload()
         self.image_to_video.enable_model_cpu_offload()
 
+        self.varnish = Varnish(
+            device="cuda" if torch.cuda.is_available() else "cpu",
+            output_format="mp4",
+            output_codec="h264",
+            output_quality=23,
+            enable_mmaudio=False
+        )
+
     def _validate_and_adjust_resolution(self, width: int, height: int) -> Tuple[int, int]:
         """Validate and adjust resolution to meet constraints.
 
@@ -117,57 +125,44 @@ class EndpointHandler:
 
         return num_frames, fps
 
-    def _create_video_file(self, frames: torch.Tensor, fps: int = DEFAULT_FPS) -> bytes:
-        """Convert frames to an MP4 video file.
-
-        Args:
-            frames (torch.Tensor): Generated frames tensor
-            fps (int): Frames per second for the output video
-
-        Returns:
-            bytes: MP4 video file content
-        """
-        # Log frame information
-        num_frames = frames.shape[1]
-        duration = num_frames / fps
-        logger.info(f"Creating video with {num_frames} frames at {fps} FPS (duration: {duration:.2f} seconds)")
-
-        # Convert tensor to numpy array
-        video_np = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float().numpy()
-        video_np = (video_np * 255).astype(np.uint8)
-
-        # Get dimensions
-        _, height, width, _ = video_np.shape
-        logger.info(f"Video dimensions: {width}x{height}")
-
-        # Create temporary file
-        output_path = tempfile.mktemp(suffix=".mp4")
-
-        try:
-            # Create video clip and write to file
-            clip = ImageSequenceClip(list(video_np), fps=fps)
-
-            # potential speed optimizations:
-            # there is a preset= field, to trade encoding speed with file size (but not quality)
-            # values are: ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, placebo
-            #
-            # there is a threads= field, by default None, which can be set to 2, 3, 4 etc.
-            clip.write_videofile(output_path, codec="libx264", audio=False)
-
-            # Read the video file
-            with open(output_path, "rb") as f:
-                video_content = f.read()
-
-            return video_content
-
-        finally:
-            # Cleanup
-            if os.path.exists(output_path):
-                os.remove(output_path)
-
-            # Clear memory
-            del video_np
-            torch.cuda.empty_cache()
+    async def process_and_encode_video(
+        self,
+        frames: torch.Tensor,
+        fps: int,
+        upscale_factor: int = 0,
+        enable_interpolation: bool = False,
+        interpolation_exp: int = 1
+    ) -> tuple[str, dict]:
+        """Process video frames using Varnish and return base64 encoded result"""
+
+        # Process video with Varnish
+        result = await self.varnish(
+            input_data=frames,
+            input_fps=fps,
+            output_fps=fps,
+            enable_upscale=upscale_factor > 1,
+            upscale_factor=upscale_factor,
+            enable_interpolation=enable_interpolation,
+            interpolation_exp=interpolation_exp
+        )
+
+        # Get video as data URI
+        video_data_uri = await result.write(
+            output_type="data-uri",
+            output_format="mp4",
+            output_codec="h264",
+            output_quality=23
+        )
+
+        metadata = {
+            "width": result.metadata.width,
+            "height": result.metadata.height,
+            "num_frames": result.metadata.frame_count,
+            "fps": result.metadata.fps,
+            "duration": result.metadata.duration
+        }
+
+        return video_data_uri, metadata
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Process the input data and generate video using LTX.
@@ -189,35 +184,32 @@ class EndpointHandler:
             - content-type: MIME type of the video (right now always "video/mp4")
            - metadata: Dictionary with actual values used for generation
         """
-        # Get inputs from request data
+
        prompt = data.get("inputs", None)
         if not prompt:
             raise ValueError("No prompt provided in the 'inputs' field")
 
-        # Get and validate resolution
+        # Get generation parameters
         width = data.get("width", self.DEFAULT_WIDTH)
         height = data.get("height", self.DEFAULT_HEIGHT)
         width, height = self._validate_and_adjust_resolution(width, height)
-
-        # Get and validate frames and FPS
+
         num_frames = data.get("num_frames", self.DEFAULT_NUM_FRAMES)
         fps = data.get("fps", self.DEFAULT_FPS)
         num_frames, fps = self._validate_and_adjust_frames(num_frames, fps)
-
-        # Get other parameters with defaults
+
+        # Get post-processing parameters
+        upscale_factor = data.get("upscale_factor", 0)
+        enable_interpolation = data.get("enable_interpolation", False)
+        interpolation_exp = data.get("interpolation_exp", 1)
+
         guidance_scale = data.get("guidance_scale", 7.5)
         num_inference_steps = data.get("num_inference_steps", self.DEFAULT_NUM_STEPS)
-
         seed = data.get("seed", -1)
         seed = random.randint(0, 2**32 - 1) if seed == -1 else int(seed)
-
-        logger.info(f"Generating video with prompt: '{prompt}'")
-        logger.info(f"Video params: size={width}x{height}, num_frames={num_frames}, fps={fps}")
-        logger.info(f"Generation params: seed={seed}, guidance_scale={guidance_scale}, num_inference_steps={num_inference_steps}")
 
         try:
             with torch.no_grad():
-
                 random.seed(seed)
                 np.random.seed(seed)
                 generator.manual_seed(seed)
@@ -233,43 +225,40 @@ class EndpointHandler:
                     "generator": generator
                 }
 
-                # Check if image is provided for image-to-video generation
+                # Generate frames using appropriate pipeline
                 image_data = data.get("image")
                 if image_data:
                     if image_data.startswith('data:'):
                         image_data = image_data.split(',', 1)[1]
                     image_bytes = base64.b64decode(image_data)
                     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-                    logger.info("Using image-to-video generation mode")
                     generation_kwargs["image"] = image
-                    output = self.image_to_video(**generation_kwargs).frames
+                    frames = self.image_to_video(**generation_kwargs).frames
                 else:
-                    logger.info("Using text-to-video generation mode")
-                    output = self.text_to_video(**generation_kwargs).frames
 
-                # Convert frames to video file
-                video_content = self._create_video_file(output, fps=fps)
-
-                # Encode video to base64
-                video_base64 = base64.b64encode(video_content).decode('utf-8')
-
-                content_type = "video/mp4"
-
-                # Add MP4 data URI prefix
-                video_data_uri = f"data:{content_type};base64,{video_base64}"
-
+                    frames = self.text_to_video(**generation_kwargs).frames
+
+                # Process and encode video
+                video_data_uri, metadata = await self.process_and_encode_video(
+                    frames=frames,
+                    fps=fps,
+                    upscale_factor=upscale_factor,
+                    enable_interpolation=enable_interpolation,
+                    interpolation_exp=interpolation_exp
+                )
+
+                # Add generation metadata
+                metadata.update({
+                    "num_inference_steps": num_inference_steps,
+                    "seed": seed,
+                    "upscale_factor": upscale_factor,
+                    "interpolation_enabled": enable_interpolation,
+                    "interpolation_exp": interpolation_exp
+                })
+
                 return {
                     "video": video_data_uri,
-                    "content-type": content_type,
-                    "metadata": {
-                        "width": width,
-                        "height": height,
-                        "num_frames": num_frames,
-                        "fps": fps,
-                        "duration": num_frames / fps,
-                        "num_inference_steps": num_inference_steps,
-                        "seed": seed
-                    }
+                    "content-type": "video/mp4",
+                    "metadata": metadata
                 }
 
         except Exception as e:
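
Note: as committed, __call__ remains a plain def while its body now uses
"await self.process_and_encode_video(...)", and await is a SyntaxError
outside an async def. A minimal sketch of one way to drive the async
Varnish helper from the synchronous entry point (the _run_async helper is
an illustrative assumption, not part of this commit):

    import asyncio

    def _run_async(coro):
        # Inference Endpoints invoke __call__ synchronously, so the event
        # loop has to be driven manually to run the async Varnish helper.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop.run_until_complete(coro)

    # Inside __call__, instead of the bare await:
    # video_data_uri, metadata = _run_async(self.process_and_encode_video(
    #     frames=frames, fps=fps, upscale_factor=upscale_factor,
    #     enable_interpolation=enable_interpolation,
    #     interpolation_exp=interpolation_exp))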
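
For reference, a hypothetical client request exercising the post-processing
fields this commit adds (the endpoint URL and token are placeholders; the
payload keys and defaults match what __call__ reads via data.get(...)):

    import requests

    API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
    headers = {"Authorization": "Bearer <HF_API_TOKEN>"}              # placeholder

    payload = {
        "inputs": "a quiet lake at sunrise, cinematic drone shot",
        "width": 768,
        "height": 512,
        "upscale_factor": 0,            # > 1 enables Varnish upscaling
        "enable_interpolation": False,  # frame interpolation, off by default
        "interpolation_exp": 1,
        "guidance_scale": 7.5,
        "seed": -1,                     # -1 selects a random seed
    }

    response = requests.post(API_URL, headers=headers, json=payload)
    result = response.json()
    # result["video"] holds a "data:video/mp4;base64,..." URI and
    # result["metadata"] echoes the values actually used for generation.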
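
To turn the returned data URI back into a playable file, standard-library
decoding is enough (this assumes the "data:video/mp4;base64,..." shape the
handler produces):

    import base64

    def save_data_uri(data_uri: str, path: str = "output.mp4") -> None:
        # Split "data:video/mp4;base64,<payload>" at the first comma
        # and decode the base64 payload into raw MP4 bytes.
        header, _, b64_payload = data_uri.partition(",")
        if not header.startswith("data:video/mp4"):
            raise ValueError(f"unexpected data URI header: {header}")
        with open(path, "wb") as f:
            f.write(base64.b64decode(b64_payload))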