juanpablomesa committed
Commit · e8faf30 · Parent(s): e30ecf9

Initial commit of XCLIP video processing endpoint (video frames only)

Files changed:
- handler.py (+245, -0)
- requirements.txt (+25, -0)
handler.py
ADDED
@@ -0,0 +1,245 @@
import tempfile
from typing import Any, Dict, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import numpy as np
import requests
import torch
from transformers import AutoTokenizer, XCLIPModel, XCLIPProcessor
from huggingface_hub import logging

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EndpointHandler:
    def __init__(self, path=""):
        # Preload everything needed at inference time. The model is loaded from
        # `path`; the intended checkpoint is "microsoft/xclip-base-patch16-zero-shot".
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = XCLIPProcessor.from_pretrained(path)
        self.model = XCLIPModel.from_pretrained(path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

        logging.set_verbosity_debug()
        self.logger = logging.get_logger(__name__)
        # Check if CUDA (GPU support) is available
        if torch.cuda.is_available():
            self.logger.info("GPU is available for inference.")
            self.logger.info(f"Using {torch.cuda.get_device_name(0)}")
        else:
            self.logger.info("GPU is not available, using CPU for inference.")

    def download_video_as_bytes(self, url: str) -> Tuple[Optional[bytes], Optional[dict]]:
        """
        Download a video from a given URL, load it in RAM, and return it as bytes.

        Parameters:
        - url (str): The URL of the video to download.

        Returns:
        - bytes or None: The video content as bytes if successful, None otherwise.
        - dict or None: The video download headers if successful, None otherwise.
        """
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.content, response.headers
        except requests.RequestException as e:
            print(f"Error downloading the video: {e}")
            return None, None

    def extract_evenly_spaced_frames_from_bytes(
        self, video_bytes: bytes, num_frames: int = 32
    ) -> list:
        # Write the bytes to a temporary file so OpenCV can open it by name
        with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
            temp_video.write(video_bytes)
            temp_video.flush()

            # Create a VideoCapture object using the temporary file's name
            vidcap = cv2.VideoCapture(temp_video.name)

            # Get the total number of frames in the video
            total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Calculate the interval at which frames should be extracted,
            # guarding against videos with fewer frames than num_frames
            interval = max(total_frames // num_frames, 1)

            frames = []

            for i in range(num_frames):
                # Set the video position to the next frame to be captured
                vidcap.set(cv2.CAP_PROP_POS_FRAMES, i * interval)

                # Read the frame
                success, image = vidcap.read()

                # If successfully read, add the frame to the list
                if success:
                    frames.append(image)

            vidcap.release()
            return frames

    def preprocess_frames(self, video_frames):
        """
        Convert video frames into a format suitable for the model.
        """
        frames = np.array(video_frames)
        # Use the XCLIP processor to preprocess the frames
        inputs = self.processor(
            text=None, videos=list(frames), return_tensors="pt", padding=True
        ).to(self.device)

        return inputs

    def embed_frames_with_xclip_processing(self, frames):
        # Initialize an empty list to store the frame embeddings
        frame_embeddings = []

        frame_preprocessed = self.preprocess_frames(frames)

        # Pass the preprocessed frames through the model to get the video embedding
        frame_embedding = self.model.get_video_features(**frame_preprocessed)

        # Add the embedding to the list
        frame_embeddings.append(frame_embedding)

        # Stack the list of embeddings into a single tensor
        tensor = torch.stack(frame_embeddings)

        # Detach the video embedding from the graph, move it to the CPU,
        # and convert it to a numpy array
        batch_emb = tensor.squeeze(0)
        batch_emb = batch_emb.cpu().detach().numpy()

        # L2-normalize each embedding vector ...
        batch_emb = batch_emb.T / np.linalg.norm(batch_emb, axis=1)
        # ... and transpose back to (batch, 512)
        batch_emb = batch_emb.T

        batch_emb = batch_emb.tolist()

        return batch_emb

    def process_video(self, video_url, video_metadata):
        try:
            video_bytes, video_headers = self.download_video_as_bytes(video_url)
            frames = self.extract_evenly_spaced_frames_from_bytes(
                video_bytes, num_frames=32
            )
            frame_embeddings = self.embed_frames_with_xclip_processing(frames)
            video_metadata["url"] = video_url
            return frame_embeddings, video_metadata
        except Exception as e:
            print(e)
            return None, None

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process the input data based on its type and return the embeddings.

        This method accepts a dictionary with a 'process_type' key that can be either 'videos' or 'text'.
        If 'process_type' is 'videos', the method expects a list of video URLs under the 'videos_urls' key
        and a parallel list of metadata dicts under the 'videos_metadata' key.
        It downloads and processes these videos and returns their embeddings.
        If 'process_type' is 'text', the method expects a string query under the 'query' key.
        It processes this text and returns its embedding.

        Parameters:
        - data: Dict[str, Any]
            A dictionary containing the data to be processed.
            It must include a 'process_type' key with value either 'videos' or 'text'.
            If 'process_type' is 'videos', data must also include a 'videos_urls' key with a list of video URLs.
            If 'process_type' is 'text', data must also include a 'query' key with a string query.

        Returns:
        - Dict[str, Any]
            A dictionary containing the embeddings of the processed data
            (and, for videos, the processed metadata).
            If an error occurs during processing, the dictionary will include an 'error' key with the error message.

        Raises:
        - ValueError: If the required keys for 'videos' or 'text' are not present or are of the wrong type.
        """

        if data["process_type"] == "videos":
            try:
                if "videos_urls" not in data or not isinstance(
                    data["videos_urls"], list
                ):
                    raise ValueError(
                        "Data must contain 'videos_urls' key with a list of video URLs."
                    )

                batch_size = 10
                if "batch_size" in data:
                    batch_size = int(data["batch_size"])
                # Download and process the videos in batches
                processed_video_embeddings = []
                processed_videos_metadata = []

                for i in range(0, len(data["videos_urls"]), batch_size):
                    videos_urls = data["videos_urls"][i : i + batch_size]
                    videos_metadata = data["videos_metadata"][i : i + batch_size]

                    with ThreadPoolExecutor() as executor:
                        futures = [
                            executor.submit(self.process_video, url, metadata)
                            for url, metadata in zip(videos_urls, videos_metadata)
                        ]
                        for future in as_completed(futures):
                            frame_embeddings, video_metadata = future.result()
                            if frame_embeddings is not None:
                                processed_video_embeddings.append(frame_embeddings)

                                processed_metadata = {
                                    "text": video_metadata["caption"],
                                    "source": video_metadata["url"],
                                    "source_type": "video_frames",
                                    **video_metadata,
                                }
                                processed_videos_metadata.append(processed_metadata)
                # Return the embeddings
                return {
                    "embeddings": processed_video_embeddings,
                    "metadata": processed_videos_metadata,
                }

            except Exception as e:
                print(f"Error during videos processing: {str(e)}")
                return {"embeddings": [], "error": str(e)}

        elif data["process_type"] == "text":
            if "query" not in data or not isinstance(data["query"], str):
                raise ValueError("Data must contain 'query' key which is a str.")
            query = data["query"]
            inputs = self.tokenizer(query, return_tensors="pt").to(self.device)
            text_emb = self.model.get_text_features(**inputs)
            # Detach the text embedding from the graph, move it to the CPU,
            # and convert it to a numpy array
            text_emb = text_emb.detach().cpu().numpy()

            # Calculate the value to normalize each vector by, and normalize
            norm_factor = np.linalg.norm(text_emb, axis=1)

            text_emb = text_emb.T / norm_factor
            # Transpose back to (batch, 512)
            text_emb = text_emb.T

            # Convert the array to a list for the JSON response
            text_emb_list = text_emb.tolist()

            return {"embeddings": text_emb_list}

        else:
            error_msg = (
                "Error during XCLIP endpoint processing: data['process_type'] "
                f"is '{data['process_type']}', which is neither 'videos' nor 'text'"
            )
            print(error_msg)
            return {"embeddings": [], "error": error_msg}
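
For local smoke-testing, the handler can be exercised directly; a minimal sketch follows. The checkpoint id comes from the handler itself, but the video URL, caption, and batch size are placeholder assumptions, and the video call only succeeds if the URL actually serves a downloadable video:

# Minimal local smoke test (sketch). Assumes handler.py is on the import path
# and the checkpoint can be pulled from the Hub; the URL and caption below are
# placeholders, not values from this repository.
from handler import EndpointHandler

handler = EndpointHandler(path="microsoft/xclip-base-patch16-zero-shot")

# Video embeddings: one metadata dict per URL; each dict needs a 'caption'
# key because the handler copies it into the returned metadata.
video_result = handler(
    {
        "process_type": "videos",
        "videos_urls": ["https://example.com/sample.mp4"],  # placeholder URL
        "videos_metadata": [{"caption": "a person playing guitar"}],
        "batch_size": 10,
    }
)
print(len(video_result["embeddings"]), "video embedding(s)")

# Text embedding for retrieval against the video embeddings.
text_result = handler({"process_type": "text", "query": "a person playing guitar"})
print(len(text_result["embeddings"][0]), "dimensions")  # 512 for this checkpoint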
requirements.txt
ADDED
@@ -0,0 +1,25 @@
certifi==2023.7.22
charset-normalizer==3.3.2
colorama==0.4.6
filelock==3.13.1
fsspec==2023.10.0
huggingface-hub==0.17.3
idna==3.4
Jinja2==3.1.2
MarkupSafe==2.1.3
mpmath==1.3.0
networkx==3.2.1
numpy==1.24.4
opencv-python==4.8.1.78
packaging==23.2
Pillow==10.1.0
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
safetensors==0.4.0
sympy==1.12
tokenizers==0.13.3
tqdm==4.66.1
transformers==4.27.2
typing_extensions==4.8.0
urllib3==2.0.7