1inchcard committed on
Commit
3efb08e
1 Parent(s): 009ce0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -13
app.py CHANGED
@@ -1,23 +1,39 @@
1
- import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  from PIL import Image
 
 
4
 
5
- # Load the tokenizer and model
6
  tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
7
- model = AutoModelForSequenceClassification.from_pretrained("neulab/UIX-Qwen2")
8
 
9
- # Function to process the screenshot and prompt
 
 
 
 
 
 
 
 
10
  def predict_coordinates(screenshot, prompt):
11
- # Process the image and prompt here
12
- # For now, we'll use the prompt as input (actual screenshot integration needs proper pre-processing)
13
 
 
14
  inputs = tokenizer(prompt, return_tensors="pt")
15
- outputs = model(**inputs)
16
 
17
- # Example response (fake coordinates for now)
18
- coordinates = {"x": 100, "y": 200} # This would come from the model output
19
-
20
- return coordinates
 
 
 
 
 
 
21
 
22
  # Gradio Interface
23
  with gr.Blocks() as demo:
@@ -34,4 +50,4 @@ with gr.Blocks() as demo:
34
  submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)
35
 
36
  # Launch the Gradio app
37
- demo.launch()
 
1
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
2
+ import torch
3
  from PIL import Image
4
+ import numpy as np
5
+ import gradio as gr
6
 
7
+ # Load the model and tokenizer
8
  tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
9
+ model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
10
 
11
def preprocess_image(image):
    """Turn a screenshot into a normalized float tensor of shape (1, 3, 224, 224).

    Args:
        image: A PIL image, or a NumPy array that ``PIL.Image.fromarray``
            can wrap (Gradio's ``Image`` component hands over arrays unless
            it is configured with ``type="pil"``).

    Returns:
        A ``torch.float32`` tensor laid out as (batch, channel, height, width)
        with values scaled to [0, 1].

    Raises:
        ValueError: If ``image`` is ``None`` (no screenshot was supplied).
    """
    if image is None:
        raise ValueError("A screenshot is required to predict coordinates.")

    # ndarray.resize() has different semantics from PIL's resize, so wrap
    # array input in a PIL image first.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Force exactly three channels: grayscale input would otherwise be 2-D and
    # break permute(2, 0, 1), and RGBA input would produce a 4-channel tensor.
    # NOTE(review): 224x224 is a placeholder size -- confirm against the model.
    rgb = image.convert("RGB").resize((224, 224))

    pixels = np.asarray(rgb, dtype=np.float32) / 255.0  # scale to [0, 1]

    # HWC -> CHW, then add the leading batch dimension.
    return torch.from_numpy(pixels).permute(2, 0, 1).unsqueeze(0)
18
+
19
def predict_coordinates(screenshot, prompt):
    """Predict an (x, y) location for ``prompt`` on ``screenshot``.

    Args:
        screenshot: The screenshot image; passed to ``preprocess_image``.
        prompt: Text instruction; tokenized with the module-level tokenizer.

    Returns:
        A dict ``{"x": ..., "y": ...}`` holding the two argmax indices taken
        over the last dimension of the model's logits.

    Raises:
        ValueError: If the model output does not reduce to exactly two values.
    """
    image_tensor = preprocess_image(screenshot)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        # NOTE(review): assumes the model accepts `pixel_values` next to the
        # tokenized text -- confirm against the model's forward signature.
        outputs = model(**inputs, pixel_values=image_tensor)

    # NOTE(review): placeholder -- assumes the output exposes `.logits` and
    # that they encode coordinates; adapt to the model's real output format.
    coordinates = outputs.logits

    # The tokenizer adds a batch dimension, so the argmax result is nested;
    # flatten it before unpacking instead of unpacking a list of lists.
    indices = torch.argmax(coordinates, dim=-1).flatten().tolist()
    if len(indices) != 2:
        raise ValueError(
            f"Expected exactly 2 coordinate values from the model, got {len(indices)}."
        )
    x, y = indices

    return {"x": x, "y": y}
37
 
38
  # Gradio Interface
39
  with gr.Blocks() as demo:
 
50
  submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)
51
 
52
  # Launch the Gradio app
53
+ demo.launch()