Spaces:

SkalskiP
/

florence-2-video

Running on Zero

initial code version

4ae7d54 4 months ago

1.65 kB

	import torch

	import numpy as np
	import supervision as sv
	from PIL import Image


	# Florence-2 task token for detailed captioning; passed as `task` when the
	# captioning output is post-processed.
	CAPTIONING_TASK = "<DETAILED_CAPTION>"
	# Task token for caption-to-phrase grounding; prefixed to the caption to build
	# the prompt and passed as `task` when the grounding output is post-processed.
	CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"


	def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
	    """Generate a detailed caption for a single image.

	    Args:
	        model: Model exposing ``generate(input_ids=..., pixel_values=...)``.
	        processor: Matching processor; it is called to build the model inputs
	            and must provide ``batch_decode`` and ``post_process_generation``.
	        image: Array accepted by ``PIL.Image.fromarray``; it is converted to
	            RGB before being handed to the processor.
	        device: Device the processor outputs are moved to before generation.

	    Returns:
	        Whatever ``processor.post_process_generation`` returns for
	        ``CAPTIONING_TASK``.
	        NOTE(review): the signature is annotated ``-> str``, but Florence-2
	        processors usually return a dict keyed by the task token. The value is
	        returned unchanged here so existing callers keep working -- confirm
	        and align the annotation (or the return) with the callers.
	    """
	    image = Image.fromarray(image).convert("RGB")
	    # Build the prompt from the shared constant (instead of repeating the
	    # literal) so the prompt and the post-processing task cannot drift apart.
	    text = CAPTIONING_TASK

	    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
	    generated_ids = model.generate(
	        input_ids=inputs["input_ids"],
	        pixel_values=inputs["pixel_values"],
	        max_new_tokens=1024,
	        num_beams=3,
	    )
	    # Special tokens are kept in the decoded text; it is passed as-is to
	    # post_process_generation together with the task token.
	    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
	    return processor.post_process_generation(
	        generated_text, task=CAPTIONING_TASK, image_size=image.size)


	def run_caption_to_phrase_grounding(
	    model,
	    processor,
	    caption: str,
	    image: np.ndarray,
	    device: torch.device
	) -> sv.Detections:
	    """Ground the phrases of ``caption`` in ``image`` and return detections.

	    Args:
	        model: Model exposing ``generate(input_ids=..., pixel_values=...)``.
	        processor: Matching processor; it is called to build the model inputs
	            and must provide ``batch_decode`` and ``post_process_generation``.
	        caption: Text appended after the grounding task token to form the prompt.
	        image: Array accepted by ``PIL.Image.fromarray``; converted to RGB.
	        device: Device the processor outputs are moved to before generation.

	    Returns:
	        ``sv.Detections`` built from the post-processed model response via
	        ``sv.Detections.from_lmm`` with ``sv.LMM.FLORENCE_2``.
	    """
	    rgb_image = Image.fromarray(image).convert("RGB")
	    image_wh = rgb_image.size
	    prompt = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"

	    batch = processor(text=prompt, images=rgb_image, return_tensors="pt").to(device)
	    generation_kwargs = {
	        "input_ids": batch["input_ids"],
	        "pixel_values": batch["pixel_values"],
	        "max_new_tokens": 1024,
	        "num_beams": 3,
	    }
	    output_ids = model.generate(**generation_kwargs)

	    # Only the first decoded sequence is used; special tokens are kept in it.
	    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
	    parsed = processor.post_process_generation(
	        decoded[0],
	        task=CAPTION_TO_PHRASE_GROUNDING_TASK,
	        image_size=image_wh,
	    )
	    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, parsed, resolution_wh=image_wh)