import requests
import torch
import cv2
from PIL import Image
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    BlipProcessor,
    BlipForConditionalGeneration,
)

# --- Model 1: ViT-GPT2 captioning (nlpconnect/vit-gpt2-image-captioning) ---
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device1)

max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}


def image_to_text_model_1(image_url):
    # Download the image, caption it with model 1, and return the decoded predictions.
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
    pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds


def bytes_to_text_model_1(bts):
    # Caption an in-memory RGB frame (PIL image or NumPy array) with model 1.
    pixel_values = feature_extractor1(images=[bts], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    print(preds[0])


# --- Model 2: BLIP fine-tuned on FuseCap captions (noamrot/FuseCap) ---
device2 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)


def image_to_text_model_2(img_url):
    # Download the image and run captioning conditioned on the prompt "a picture of ".
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    text = "a picture of "
    inputs = processor2(raw_image, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))


def bytes_to_text_model_2(byts):
    # Caption an in-memory RGB frame with model 2, conditioned on "a picture of ".
    text = "a picture of "
    inputs = processor2(byts, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))


# --- Model 3: BLIP large (Salesforce/blip-image-captioning-large) ---
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")


def image_to_text_model_3(img_url):
    # Download the image and run unconditional captioning with model 3.
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    inputs = processor3(raw_image, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))


def bytes_to_text_model_3(byts):
    # Caption an in-memory RGB frame with model 3 (unconditional).
    inputs = processor3(byts, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))


def FrameCapture(path):
    # Read the video and caption every 20th frame with all three models.
    vidObj = cv2.VideoCapture(path)
    count = 0
    while True:
        success, image = vidObj.read()
        if not success:
            # No more frames (or the file could not be read); stop instead of
            # passing None to the captioning functions.
            break
        if count % 20 == 0:
            # OpenCV returns frames in BGR order; convert to RGB before captioning.
            frame = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            print("NEW FRAME")
            print("MODEL 1")
            bytes_to_text_model_1(frame)
            print("MODEL 2")
            bytes_to_text_model_2(frame)
            print("MODEL 3")
            bytes_to_text_model_3(frame)
            print("\n\n")
        count += 1
    vidObj.release()


FrameCapture("animation.mp4")