from transformers import TimesformerModel, VideoMAEImageProcessor
import torch
import cv2
import numpy as np
from torchvision.transforms import Lambda
from pytorchvideo.transforms import Normalize
import os
from os.path import isfile, join, basename


def extract_features(frames, device, model, image_processor):
    # Stack the list of (H, W, C) frames into a single (num_frames, H, W, C) tensor
    frames_tensor = torch.stack([torch.from_numpy(frame) for frame in frames])
    # Reorder to (channels, num_frames, height, width)
    frames_tensor = frames_tensor.permute(3, 0, 1, 2).to(device)
    # Get the normalization statistics from the image processor
    mean = image_processor.image_mean
    std = image_processor.image_std
    # Scale pixel values to [0, 1] and normalize per channel
    frames_tensor = Lambda(lambda x: x / 255.0)(frames_tensor)
    frames_tensor = Normalize(mean, std)(frames_tensor)
    # Reorder to (num_frames, channels, height, width) and add a batch dimension
    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
    # Move the model to the device and run it in inference mode
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(frames_tensor)
    # Take the [CLS] token of the last hidden state as the clip-level feature
    final_output = outputs[0][:, 0]
    return final_output


def to_video(selected_frames, frames, output_path, video_fps):
    print("MP4 format.")
    # Write the selected frames to a video file
    video_writer = cv2.VideoWriter(
        output_path,
        cv2.VideoWriter_fourcc(*'mp4v'),
        video_fps,
        (frames[0].shape[1], frames[0].shape[0]),
    )
    # selected_frames is a list of frame indices into `frames`
    for idx in selected_frames:
        video_writer.write(frames[idx])
    video_writer.release()
    print("Completed summarizing the video (wait a moment for it to load).")


def load_model():
    try:
        # Load the TimeSformer backbone and the image processor that provides normalization stats
        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600").to(DEVICE).eval()
        processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        return model, processor, DEVICE
    except Exception as e:
        print(e)


def sum_of_squared_difference(vector1, vector2):
    # Sum of squared differences (squared Euclidean distance) between two feature vectors
    squared_diff = np.square(vector1 - vector2)
    sum_squared_diff = np.sum(squared_diff)
    return sum_squared_diff
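

# Illustrative usage sketch (not part of the original pipeline): ties the helpers
# above together on a hypothetical clip "sample.mp4", resized to the 224x224,
# 8-frame input that the TimeSformer checkpoint expects.
if __name__ == "__main__":
    model, processor, device = load_model()
    cap = cv2.VideoCapture("sample.mp4")
    frames = []
    ok, frame = cap.read()
    while ok:
        frames.append(cv2.resize(frame, (224, 224)))
        ok, frame = cap.read()
    cap.release()
    # Use the first 8 frames as a single clip and extract its feature vector
    clip_feature = extract_features(frames[:8], device, model, processor)
    print(clip_feature.shape)  # expected: torch.Size([1, 768])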