| | import torch |
| | import spaces |
| | import gradio as gr |
| | from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor |
| | from PIL import Image |
| |
|
| | |
# Checkpoint id used for both the model weights and the processor.
MODEL_ID = "google/pix2struct-screen2words-large"

# Load the weights in bfloat16, move them to the GPU, and switch to eval mode.
model = Pix2StructForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
)
model = model.to("cuda")
model.eval()

# The processor is loaded from the same checkpoint as the model.
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)
| |
|
| | |
@spaces.GPU
def describe_ui(image, text):
    """Generate a textual description of a UI screenshot.

    Args:
        image: The screenshot to describe, as delivered by the image input
            (configured with ``type="pil"``). ``None`` is rejected.
        text: Optional prompt. ``None``, an empty string, or a
            whitespace-only string is treated as "no prompt".

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    # Fail early with a readable message instead of letting the processor
    # or the model raise an opaque error on a missing image.
    if image is None:
        raise gr.Error("Please upload a UI screenshot first.")

    processor_kwargs = {"images": image, "return_tensors": "pt"}

    prompt = (text or "").strip()
    if prompt:
        # Follows the conditional-generation example in the transformers
        # Pix2Struct docs: the prompt is passed without special tokens so
        # that it is not terminated before generation continues from it.
        processor_kwargs["text"] = prompt
        processor_kwargs["add_special_tokens"] = False
    # With no prompt, ``text`` is omitted entirely rather than passed as "",
    # so the call matches the docs' unconditional captioning example.

    inputs = processor(**processor_kwargs).to(
        dtype=torch.bfloat16, device="cuda"
    )
    predictions = model.generate(**inputs)

    # Strip special tokens so the textbox shows only the description.
    return processor.decode(predictions[0], skip_special_tokens=True)
| |
|
| | |
# Input widgets, listed in the positional order of describe_ui(image, text).
screenshot_input = gr.Image(type="pil", label="Upload UI Screenshot")
prompt_input = gr.Textbox(
    label="Optional prompt / instruction",
    placeholder="e.g. Describe layout and buttons",
)

# Single text output that shows whatever describe_ui returns.
result_output = gr.Textbox(label="Model Output")

demo = gr.Interface(
    fn=describe_ui,
    inputs=[screenshot_input, prompt_input],
    outputs=result_output,
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and optionally enter a text prompt. The model (Google Pix2Struct) will generate a detailed description.",
)

# Start the web app as soon as the module is executed.
demo.launch()
| |
|