from functools import lru_cache

from PIL import Image
from transformers import Pix2StructForConditionalGeneration as psg
from transformers import Pix2StructProcessor as psp

# Checkpoint used when the caller does not name one.
_DEFAULT_CHECKPOINT = "google/pix2struct-docvqa-large"


@lru_cache(maxsize=1)
def _load_model_and_processor(checkpoint):
    """Load the model and processor for *checkpoint*, reusing the last load.

    ``from_pretrained`` is by far the most expensive step here, so the most
    recently used pair is kept alive instead of being rebuilt on every
    question. The cache holds a single entry to avoid keeping several large
    models resident at once.
    """
    model = psg.from_pretrained(checkpoint)
    processor = psp.from_pretrained(checkpoint)
    return model, processor


def get_result(
    image_path,
    question,
    *,
    model_name=_DEFAULT_CHECKPOINT,
    max_new_tokens=256,
):
    """Run Pix2Struct on an image and a text question and return its answer.

    Args:
        image_path: Location of the image; passed to ``PIL.Image.open``.
        question: Text passed to the processor alongside the image.
        model_name: Checkpoint identifier given to ``from_pretrained`` for
            both the model and the processor.
        max_new_tokens: Generation limit forwarded to ``model.generate``.

    Returns:
        The first decoded prediction, with special tokens removed.
    """
    model, processor = _load_model_and_processor(model_name)

    # convert() returns a new, fully loaded image, so the underlying file
    # can be closed as soon as the conversion is done.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(images=image, text=question, return_tensors="pt")
    predictions = model.generate(**inputs, max_new_tokens=max_new_tokens)
    predicted_answer = processor.batch_decode(
        predictions, skip_special_tokens=True
    )
    return predicted_answer[0]