# GSI Technology Video Search Demo - Embedding Videos Notebook:

This notebook demonstrates the process of video embedding.<br>
It embeds a single video using the [Searchium-ai/clip4clip-webvid150k](https://huggingface.co/Searchium-ai/clip4clip-webvid150k) model.

In [1]:
# Path to the sample video file that the cells below decode and embed.
example = './example/34721191.mp4'

In [2]:
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
from PIL import Image
import cv2
import numpy as np
import torch

# Convert one video into a small batch of sampled, preprocessed frames.
def video2image(video_path, frame_rate=1.0, size=224):
    """Sample frames from a video file and return them as a float tensor.

    Frames are picked every ``fps / frame_rate`` source frames, converted
    from OpenCV's BGR order to RGB, resized (bicubic) and center-cropped to
    ``size`` x ``size``, converted to a tensor and normalized with the
    per-channel mean/std constants below.

    Args:
        video_path: Path of the video file to open with OpenCV (FFMPEG backend).
        frame_rate: Number of frames to sample per second of video.
        size: Side length, in pixels, of the square output frames.

    Returns:
        A ``torch`` float32 tensor of shape ``(n_frames, 3, size, size)``
        holding the frames that were successfully read; ``n_frames`` is 0 if
        no frame could be decoded. If the reported FPS is below 1 (the file
        could not be read), an error is printed and an all-zero tensor of
        shape ``(3, size, size)`` is returned, as in the original code.
    """
    # Build the transform pipeline once instead of once per frame.
    preprocess = Compose([
        Resize(size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

    # Open the capture exactly once; a second, discarded VideoCapture would
    # never be released.
    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        if fps < 1:
            images = np.zeros([3, size, size], dtype=np.float32)
            print("ERROR: problem reading video file: ", video_path)
        else:
            # Duration in whole seconds, rounded up.
            total_duration = (frame_count + fps - 1) // fps
            # Distance, in source frames, between two sampled frames.
            interval = fps / frame_rate
            frames_idx = np.floor(np.arange(0, total_duration * fps, interval))
            images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)

            # Count of frames actually decoded; stays 0 if the very first
            # read fails, so the slice below is always well defined.
            frames_read = 0
            for i, idx in enumerate(frames_idx):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                images[i, :, :, :] = preprocess(Image.fromarray(frame))
                frames_read = i + 1

            if frames_read == 0:
                print("ERROR: problem reading video file: ", video_path)
            # Drop the pre-allocated slots that were never filled.
            images = images[:frames_read]
    finally:
        # Release the capture even if decoding or preprocessing raised.
        cap.release()
    return torch.tensor(images)
    
# Decode the example video into a tensor of preprocessed frames.
video = video2image(example)

In [3]:
from transformers import CLIPVisionModelWithProjection

# Load the pretrained vision encoder and switch it to evaluation mode.
model = CLIPVisionModelWithProjection.from_pretrained("Searchium-ai/clip4clip-webvid150k")
model = model.eval()

# Inference only: disable autograd so no computation graph is retained and
# the resulting embedding carries no grad_fn.
with torch.no_grad():
    visual_output = model(video)

    # Normalize each frame embedding, average over the frames, then
    # re-normalize the averaged vector to unit length.
    visual_output = visual_output["image_embeds"]
    visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
    visual_output = torch.mean(visual_output, dim=0)
    visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
print(visual_output)

    

tensor([-2.9570e-02,  6.0339e-03,  1.7294e-02, -1.3951e-02,  4.8329e-02,
         2.4099e-02,  3.3340e-02,  3.1769e-02,  2.1997e-03,  4.2602e-03,
        -1.3887e-02,  8.2744e-03,  2.5123e-03, -2.2163e-02, -4.1139e-02,
        -1.2101e-02, -6.1914e-02,  6.7091e-03,  4.2834e-02, -2.2604e-02,
        -2.7443e-02,  1.0600e-02,  2.9430e-03,  3.2580e-02, -1.3577e-02,
         7.8084e-03,  1.2397e-02, -5.3404e-03,  1.4736e-02, -2.4564e-02,
        -5.4057e-02,  3.9507e-02,  1.2754e-02,  4.6864e-04,  7.4087e-03,
         3.8710e-03,  7.9482e-03,  1.3444e-02, -1.7326e-02, -1.2486e-01,
        -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,
         4.1397e-03,  1.7315e-02,  4.4724e-02,  9.1722e-04,  3.1429e-02,
         3.8212e-02, -2.1133e-02,  2.4437e-03, -1.4371e-03, -2.9859e-03,
         7.8939e-04,  2.4093e-02, -2.2199e-02, -3.9110e-02,  1.7673e-02,
         1.1360e-01,  3.3466e-03, -1.9643e-02,  1.7798e-03,  1.5112e-02,
        -6.2003e-03, -2.0564e-02,  6.4936e-02,  6.6