import torch
import numpy as np
import supervision as sv
from PIL import Image

# Florence-2 task tokens; the angle-bracket tokens select the model's task.
CAPTIONING_TASK = "<CAPTION>"
CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"


def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
    # For captioning, the text prompt is just the task token itself.
    image = Image.fromarray(image).convert("RGB")
    text = CAPTIONING_TASK

    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=CAPTIONING_TASK, image_size=image.size)
    # post_process_generation returns a dict keyed by the task token;
    # index into it so the function actually returns a string.
    return response[CAPTIONING_TASK]


def run_caption_to_phrase_grounding(
    model,
    processor,
    caption: str,
    image: np.ndarray,
    device: torch.device
) -> sv.Detections:
    # For phrase grounding, the prompt is the task token followed by the
    # caption whose phrases should be localized.
    image = Image.fromarray(image).convert("RGB")
    text = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"

    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=CAPTION_TO_PHRASE_GROUNDING_TASK, image_size=image.size)
    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, response, resolution_wh=image.size)
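
# Minimal usage sketch (not part of the original helpers): load Florence-2
# through transformers and chain the two functions above. The checkpoint name
# "microsoft/Florence-2-base" and the "image.jpg" path are assumptions made
# for illustration; swap in whichever checkpoint and image you are using.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoProcessor

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Florence-2 ships custom modeling code, so trust_remote_code is required.
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )

    image = np.asarray(Image.open("image.jpg").convert("RGB"))
    caption = run_captioning(model, processor, image, device)
    detections = run_caption_to_phrase_grounding(
        model, processor, caption, image, device
    )
    print(caption)
    print(len(detections), "grounded phrases")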