Commit 1e96bca
阳渠 committed
Parent(s): 8572674

Mobile-Agent-v2

This view is limited to 50 files because it contains too many changes. See raw diff.
- MobileAgent/__pycache__/api.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/api_service.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/chat.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/controller.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/crop.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/icon_localization.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/local_server.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/prompt.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc +0 -0
- MobileAgent/__pycache__/text_localization.cpython-310.pyc +0 -0
- MobileAgent/api.py +45 -0
- MobileAgent/api_service.py +26 -0
- MobileAgent/chat.py +86 -0
- MobileAgent/crop.py +141 -0
- MobileAgent/icon_localization.py +59 -0
- MobileAgent/local_server.py +172 -0
- MobileAgent/prompt_no_input.py +174 -0
- MobileAgent/text_localization.py +58 -0
- README.md +5 -4
- app.py +465 -0
- cache/1.png +0 -0
- cache/10.png +0 -0
- cache/11.png +0 -0
- cache/12.png +0 -0
- cache/13.png +0 -0
- cache/14.png +0 -0
- cache/15.png +0 -0
- cache/16.png +0 -0
- cache/17.png +0 -0
- cache/18.png +0 -0
- cache/19.png +0 -0
- cache/2.png +0 -0
- cache/20.png +0 -0
- cache/21.png +0 -0
- cache/22.png +0 -0
- cache/23.png +0 -0
- cache/24.png +0 -0
- cache/25.png +0 -0
- cache/3.png +0 -0
- cache/4.png +0 -0
- cache/5.png +0 -0
- cache/6.png +0 -0
- cache/7.png +0 -0
- cache/8.png +0 -0
- cache/9.png +0 -0
- example/example_1.jpg +0 -0
- example/example_2.jpg +0 -0
- example/example_3.jpg +0 -0
- example/example_4.jpg +0 -0
- example/example_5.jpg +0 -0
MobileAgent/__pycache__/api.cpython-310.pyc
ADDED
Binary file (1.18 kB)
MobileAgent/__pycache__/api_service.cpython-310.pyc
ADDED
Binary file (633 Bytes)
MobileAgent/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (1.92 kB)
MobileAgent/__pycache__/controller.cpython-310.pyc
ADDED
Binary file (4.05 kB)
MobileAgent/__pycache__/crop.cpython-310.pyc
ADDED
Binary file (3.9 kB)
MobileAgent/__pycache__/icon_localization.cpython-310.pyc
ADDED
Binary file (1.77 kB)
MobileAgent/__pycache__/local_server.cpython-310.pyc
ADDED
Binary file (4.25 kB)
MobileAgent/__pycache__/prompt.cpython-310.pyc
ADDED
Binary file (9.8 kB)
MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc
ADDED
Binary file (9.04 kB)
MobileAgent/__pycache__/text_localization.cpython-310.pyc
ADDED
Binary file (1.98 kB)
MobileAgent/api.py
ADDED
@@ -0,0 +1,45 @@
import base64
import requests

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def inference_chat(chat, model, api_url, token):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}"
    }

    data = {
        "model": model,
        "messages": [],
        "max_tokens": 2048,
        'temperature': 0.0,
        "seed": 1234
    }

    for role, content in chat:
        data["messages"].append({"role": role, "content": content})

    retry = 3
    cur_try = 0
    while True:
        cur_try += 1
        if cur_try > retry:
            return "No token"
        try:
            res = requests.post(api_url, headers=headers, json=data)
            res_json = res.json()
            res_content = res_json['data']['response']['choices'][0]['message']['content']
        except:
            print("Network Error:")
            try:
                print(res.json())
            except:
                print("Request Failed")
        else:
            break

    return res_content
MobileAgent/api_service.py
ADDED
@@ -0,0 +1,26 @@
import requests
import json


def get_action(query_data, url, token):

    headers = {
        'Authorization': token,
        'Content-Type': 'application/json'
    }

    data = {
        "model": "pre-Mobile_Agent_Server_ADB_V2-2204",
        "input": {"json_data": query_data}
    }

    while True:
        try:
            response = requests.post(url, headers=headers, data=json.dumps(data))
            response.json()["output"]
        except:
            print("Network Error:", response.json())
        else:
            break

    return response
MobileAgent/chat.py
ADDED
@@ -0,0 +1,86 @@
import copy
from MobileAgent.api import encode_image


def init_action_chat():
    operation_history = []
    sysetm_prompt = "You are a helpful AI mobile phone operating assistant. You need to help me operate the phone to complete the user\'s instruction."
    operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]])
    return operation_history


def init_reflect_chat():
    operation_history = []
    sysetm_prompt = "You are a helpful AI mobile phone operating assistant."
    operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]])
    return operation_history


def init_memory_chat():
    operation_history = []
    sysetm_prompt = "You are a helpful AI mobile phone operating assistant."
    operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]])
    return operation_history


def add_response(role, prompt, chat_history, image=None):
    new_chat_history = copy.deepcopy(chat_history)
    if image:
        base64_image = encode_image(image)
        content = [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            },
        ]
    else:
        content = [
            {
                "type": "text",
                "text": prompt
            },
        ]
    new_chat_history.append([role, content])
    return new_chat_history


def add_response_two_image(role, prompt, chat_history, image):
    new_chat_history = copy.deepcopy(chat_history)

    base64_image1 = encode_image(image[0])
    base64_image2 = encode_image(image[1])
    content = [
        {
            "type": "text",
            "text": prompt
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image1}"
            }
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image2}"
            }
        },
    ]

    new_chat_history.append([role, content])
    return new_chat_history


def print_status(chat_history):
    print("*"*100)
    for chat in chat_history:
        print("role:", chat[0])
        print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
    print("*"*100)
MobileAgent/crop.py
ADDED
@@ -0,0 +1,141 @@
import math
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import clip
import torch


def crop_image(img, position):
    def distance(x1,y1,x2,y2):
        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
    position = position.tolist()
    for i in range(4):
        for j in range(i+1, 4):
            if(position[i][0] > position[j][0]):
                tmp = position[j]
                position[j] = position[i]
                position[i] = tmp
    if position[0][1] > position[1][1]:
        tmp = position[0]
        position[0] = position[1]
        position[1] = tmp

    if position[2][1] > position[3][1]:
        tmp = position[2]
        position[2] = position[3]
        position[3] = tmp

    x1, y1 = position[0][0], position[0][1]
    x2, y2 = position[2][0], position[2][1]
    x3, y3 = position[3][0], position[3][1]
    x4, y4 = position[1][0], position[1][1]

    corners = np.zeros((4,2), np.float32)
    corners[0] = [x1, y1]
    corners[1] = [x2, y2]
    corners[2] = [x4, y4]
    corners[3] = [x3, y3]

    img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
    img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)

    corners_trans = np.zeros((4,2), np.float32)
    corners_trans[0] = [0, 0]
    corners_trans[1] = [img_width - 1, 0]
    corners_trans[2] = [0, img_height - 1]
    corners_trans[3] = [img_width - 1, img_height - 1]

    transform = cv2.getPerspectiveTransform(corners, corners_trans)
    dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
    return dst


def calculate_size(box):
    return (box[2]-box[0]) * (box[3]-box[1])


def calculate_iou(box1, box2):
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    unionArea = box1Area + box2Area - interArea
    iou = interArea / unionArea

    return iou


def crop(image, box, i, text_data=None):
    image = Image.open(image)

    if text_data:
        draw = ImageDraw.Draw(image)
        draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
        # font_size = int((text_data[3] - text_data[1])*0.75)
        # font = ImageFont.truetype("arial.ttf", font_size)
        # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")

    cropped_image = image.crop(box)
    cropped_image.save(f"./temp/{i}.jpg")


def in_box(box, target):
    if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
        return True
    else:
        return False


def crop_for_clip(image, box, i, position):
    image = Image.open(image)
    w, h = image.size
    if position == "left":
        bound = [0, 0, w/2, h]
    elif position == "right":
        bound = [w/2, 0, w, h]
    elif position == "top":
        bound = [0, 0, w, h/2]
    elif position == "bottom":
        bound = [0, h/2, w, h]
    elif position == "top left":
        bound = [0, 0, w/2, h/2]
    elif position == "top right":
        bound = [w/2, 0, w, h/2]
    elif position == "bottom left":
        bound = [0, h/2, w/2, h]
    elif position == "bottom right":
        bound = [w/2, h/2, w, h]
    else:
        bound = [0, 0, w, h]

    if in_box(box, bound):
        cropped_image = image.crop(box)
        cropped_image.save(f"./temp/{i}.jpg")
        return True
    else:
        return False


def clip_for_icon(clip_model, clip_preprocess, images, prompt):
    image_features = []
    for image_file in images:
        image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to(next(clip_model.parameters()).device)
        image_feature = clip_model.encode_image(image)
        image_features.append(image_feature)
    image_features = torch.cat(image_features)

    text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device)
    text_features = clip_model.encode_text(text)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
    _, max_pos = torch.max(similarity, dim=0)
    pos = max_pos.item()

    return pos
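A small worked example of the box arithmetic above; the two boxes are illustrative values in the (x1, y1, x2, y2) convention the module uses.

# Worked example (not part of this commit).
from MobileAgent.crop import calculate_iou, calculate_size

box1 = [0, 0, 100, 100]     # area 10000
box2 = [50, 50, 150, 150]   # area 10000, overlaps box1 on a 50x50 patch
assert calculate_size(box1) == 10000
print(calculate_iou(box1, box2))  # 2500 / (10000 + 10000 - 2500) ≈ 0.1429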
MobileAgent/icon_localization.py
ADDED
@@ -0,0 +1,59 @@
from MobileAgent.crop import calculate_size, calculate_iou
from modelscope.pipelines import pipeline
from PIL import Image
import torch

def remove_boxes(boxes_filt, size, iou_threshold=0.5):
    boxes_to_remove = set()

    for i in range(len(boxes_filt)):
        if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
            boxes_to_remove.add(i)
        for j in range(len(boxes_filt)):
            if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
                boxes_to_remove.add(j)
            if i == j:
                continue
            if i in boxes_to_remove or j in boxes_to_remove:
                continue
            iou = calculate_iou(boxes_filt[i], boxes_filt[j])
            if iou >= iou_threshold:
                boxes_to_remove.add(j)

    boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]

    return boxes_filt


def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
    image = Image.open(input_image_path)
    size = image.size

    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith('.'):
        caption = caption + '.'

    inputs = {
        'IMAGE_PATH': input_image_path,
        'TEXT_PROMPT': caption,
        'BOX_TRESHOLD': box_threshold,
        'TEXT_TRESHOLD': text_threshold
    }

    result = groundingdino_model(inputs)
    boxes_filt = result['boxes']

    H, W = size[1], size[0]
    for i in range(boxes_filt.size(0)):
        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
        boxes_filt[i][2:] += boxes_filt[i][:2]

    boxes_filt = boxes_filt.cpu().int().tolist()
    filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
    coordinates = []
    for box in filtered_boxes:
        coordinates.append([box[0], box[1], box[2], box[3]])

    return coordinates
MobileAgent/local_server.py
ADDED
@@ -0,0 +1,172 @@
import os
import base64
from io import BytesIO
from PIL import Image

from MobileAgent.api import inference_chat
from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image

from dashscope import MultiModalConversation
import dashscope
import concurrent.futures


API_url = os.environ.get('url')
token = os.environ.get('token')


def base64_to_pil(base64_string):
    if base64_string.startswith('data:image'):
        base64_string = base64_string.split(',')[-1]
    image_data = base64.b64decode(base64_string)
    image_stream = BytesIO(image_data)
    pil_image = Image.open(image_stream)
    return pil_image


def process_image(image, query):
    dashscope.api_key = os.environ.get('qwen')
    image = "file://" + image
    messages = [{
        'role': 'user',
        'content': [
            {
                'image': image
            },
            {
                'text': query
            },
        ]
    }]
    response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)

    try:
        response = response['output']['choices'][0]['message']['content'][0]["text"]
    except:
        response = "This is an icon."

    return response


if not os.path.exists("screenshot"):
    os.mkdir("screenshot")
if not os.path.exists("temp"):
    os.mkdir("temp")


def mobile_agent_infer(json_data):
    task = json_data["task"]
    if task == "caption":
        query = json_data["query"]
        images = json_data["images"]
        local_images = []
        for image in images:
            image_name = image["image_name"]
            image_file = image["image_file"]
            image_file = base64_to_pil(image_file)
            image_path = "temp/" + image_name
            image_file.save(image_path, "PNG")
            local_images.append(image_path)

        icon_map = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {executor.submit(process_image, image, query): i for i, image in enumerate(local_images)}

            for future in concurrent.futures.as_completed(futures):
                i = futures[future]
                response = future.result()
                icon_map[i + 1] = response

        output = {"icon_map": icon_map}
        return output

    elif task == "planning":
        instruction = json_data["instruction"]
        thought_history = json_data["thought_history"]
        summary_history = json_data["summary_history"]
        action_history = json_data["action_history"]
        completed_requirements = json_data["completed_requirements"]
        add_info = json_data["add_info"]

        prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history,
                                             completed_requirements, add_info)
        chat_planning = init_memory_chat()
        chat_planning = add_response("user", prompt_planning, chat_planning)
        output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)

        output = {"planning": output_planning}
        return output

    elif task == "decision":
        screenshot_file = json_data["screenshot_file"]
        screenshot_file = base64_to_pil(screenshot_file)
        image_path = "screenshot/screenshot_local.png"
        screenshot_file.save(image_path, "PNG")

        instruction = json_data["instruction"]
        perception_infos = json_data["perception_infos"]
        width = json_data["width"]
        height = json_data["height"]
        summary_history = json_data["summary_history"]
        action_history = json_data["action_history"]
        summary = json_data["summary"]
        action = json_data["action"]
        add_info = json_data["add_info"]
        error_flag = json_data["error_flag"]
        completed_requirements = json_data["completed_requirements"]
        memory = json_data["memory"]
        memory_switch = json_data["memory_switch"]
        insight = json_data["insight"]

        prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history,
                                          action_history, summary, action, add_info, error_flag, completed_requirements,
                                          memory)
        chat_action = init_action_chat()
        chat_action = add_response("user", prompt_action, chat_action, image_path)
        output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
        if output_action == "No token":
            output = {"decision": "No token", "memory": None}
            return output
        chat_action = add_response("assistant", output_action, chat_action)

        output_memory = None
        if memory_switch:
            prompt_memory = get_memory_prompt(insight)
            chat_action = add_response("user", prompt_memory, chat_action)
            output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)

        output = {"decision": output_action, "memory": output_memory}
        return output

    elif task == "reflection":
        screenshot_file = json_data["screenshot_file"]
        screenshot_file = base64_to_pil(screenshot_file)
        image_path = "screenshot/screenshot_local.png"
        screenshot_file.save(image_path, "PNG")
        last_screenshot_file = json_data["last_screenshot_file"]
        last_screenshot_file = base64_to_pil(last_screenshot_file)
        last_image_path = "screenshot/last_screenshot_local.png"
        last_screenshot_file.save(last_image_path, "PNG")

        instruction = json_data["instruction"]
        last_perception_infos = json_data["last_perception_infos"]
        perception_infos = json_data["perception_infos"]
        width = json_data["width"]
        height = json_data["height"]
        summary = json_data["summary"]
        action = json_data["action"]
        add_info = json_data["add_info"]

        prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height,
                                            summary, action, add_info)
        chat_reflect = init_reflect_chat()
        chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path])
        output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)

        output = {"reflection": output_reflect}
        return output

    else:
        output = {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}
        return output
MobileAgent/prompt_no_input.py
ADDED
@@ -0,0 +1,174 @@
def get_action_prompt(instruction, clickable_infos, width, height, summary_history, action_history, last_summary, last_action, add_info, error_flag, completed_content, memory):
    prompt = "### Background ###\n"
    prompt += f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n"

    prompt += "### Screenshot information ###\n"
    prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
    prompt += "This information consists of two parts: coordinates; content. "
    prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
    prompt += "The information is as follows:\n"

    for clickable_info in clickable_infos:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"

    prompt += "Please note that this information is not necessarily accurate. You need to combine the screenshot to understand."
    prompt += "\n\n"

    if add_info != "":
        prompt += "### Hint ###\n"
        prompt += "There are hints to help you complete the user\'s instructions. The hints are as follows:\n"
        prompt += add_info
        prompt += "\n\n"

    if len(action_history) > 0:
        prompt += "### History operations ###\n"
        prompt += "Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. These operations are as follows:\n"
        for i in range(len(action_history)):
            prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(" to ")[0].strip() + "; Action: " + action_history[i] + "]\n"
        prompt += "\n"

    if completed_content != "":
        prompt += "### Progress ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

    if memory != "":
        prompt += "### Memory ###\n"
        prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
        prompt += "Memory:\n" + memory + "\n"

    if error_flag:
        prompt += "### Last operation ###\n"
        prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
        prompt += "\n\n"

    prompt += "### Response requirements ###\n"
    prompt += "Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n"
    prompt += "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n"
    prompt += "Tap (x, y): Tap the position (x, y) in current page.\n"
    prompt += "Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n"
    prompt += "Type (text): Type the \"text\" in the input box.\n"
    prompt += "Home: Return to home page.\n"
    prompt += "Stop: If you think all the requirements of user\'s instruction have been completed and no further operation is required, you can choose this action to terminate the operation process."
    prompt += "\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output consists of the following three parts:\n"
    prompt += "### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n"
    prompt += "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text are in the \"()\".\n"
    prompt += "### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought."

    return prompt


def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info):
    prompt = f"These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"

    prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
    prompt += "The information consists of two parts, consisting of format: coordinates; content. "
    prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
    prompt += "The keyboard status is whether the keyboard of the current page is activated."
    prompt += "\n\n"

    prompt += "### Before the current operation ###\n"
    prompt += "Screenshot information:\n"
    for clickable_info in clickable_infos1:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
    prompt += "\n"

    prompt += "### After the current operation ###\n"
    prompt += "Screenshot information:\n"
    for clickable_info in clickable_infos2:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
    prompt += "\n"

    prompt += "### Current operation ###\n"
    prompt += f"The user\'s instruction is: {instruction}. You also need to note the following requirements: {add_info}. In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n"
    prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
    prompt += "Operation action: " + action
    prompt += "\n\n"

    prompt += "### Response requirements ###\n"
    prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
    prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
    prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
    prompt += "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n"
    prompt += "C: The \"Operation action\" produces no changes."
    prompt += "\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output format is:\n"
    prompt += "### Thought ###\nYour thought about the question\n"
    prompt += "### Answer ###\nA or B or C"

    return prompt


def get_memory_prompt(insight):
    if insight != "":
        prompt = "### Important content ###\n"
        prompt += insight
        prompt += "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    else:
        prompt = "### Response requirements ###\n"
        prompt += "Please think about whether there is any content closely related to user\'s instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output format is:\n"
    prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."

    return prompt

def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info):
    prompt = "### Background ###\n"
    prompt += f"There is a user\'s instruction which is: {instruction}. You are a mobile phone operating assistant and are operating the user\'s mobile phone.\n\n"

    if add_info != "":
        prompt += "### Hint ###\n"
        prompt += "There are hints to help you complete the user\'s instructions. The hints are as follows:\n"
        prompt += add_info
        prompt += "\n\n"

    if len(thought_history) > 1:
        prompt += "### History operations ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follows:\n"
        for i in range(len(summary_history)):
            operation = summary_history[i].split(" to ")[0].strip()
            prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
        prompt += "\n"

        prompt += "### Progress thinking ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."

    else:
        prompt += "### Current operation ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
        prompt += f"Operation thought: {thought_history[-1]}\n"
        operation = summary_history[-1].split(" to ")[0].strip()
        prompt += f"Operation action: {operation}\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
        prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
        prompt += "(Please use English to output)"

    return prompt
MobileAgent/text_localization.py
ADDED
@@ -0,0 +1,58 @@
import cv2
import numpy as np
from MobileAgent.crop import crop_image, calculate_size
from PIL import Image


def order_point(coor):
    arr = np.array(coor).reshape([4, 2])
    sum_ = np.sum(arr, 0)
    centroid = sum_ / arr.shape[0]
    theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
    sort_points = arr[np.argsort(theta)]
    sort_points = sort_points.reshape([4, -1])
    if sort_points[0][0] > centroid[0]:
        sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
    sort_points = sort_points.reshape([4, 2]).astype('float32')
    return sort_points


def longest_common_substring_length(str1, str2):
    m = len(str1)
    n = len(str2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]


def ocr(image_path, ocr_detection, ocr_recognition):
    text_data = []
    coordinate = []

    image_full = cv2.imread(image_path)
    det_result = ocr_detection(image_full)
    det_result = det_result['polygons']
    for i in range(det_result.shape[0]):
        pts = order_point(det_result[i])
        image_crop = crop_image(image_full, pts)

        try:
            result = ocr_recognition(image_crop)['text'][0]
        except:
            continue

        box = [int(e) for e in list(pts.reshape(-1))]
        box = [box[0], box[1], box[4], box[5]]

        text_data.append(result)
        coordinate.append(box)

    else:
        return text_data, coordinate
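A usage sketch of ocr, wired up the same way app.py in this commit builds the ModelScope detection and recognition pipelines; the image path points at one of the sample screenshots added here.

# Usage sketch based on app.py in this commit.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from MobileAgent.text_localization import ocr

ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')

texts, boxes = ocr("./example/example_1.jpg", ocr_detection, ocr_recognition)
for t, b in zip(texts, boxes):
    print(b, t)  # b is [x1, y1, x2, y2] taken from opposite corners of the detected text polygon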
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
 title: Mobile Agent
-emoji:
-colorFrom:
-colorTo:
+emoji: 🦀
+colorFrom: indigo
+colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.19.1
 app_file: app.py
 pinned: false
+license: mit
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,465 @@
import io
import os
import shutil
import base64
import gradio as gr
from PIL import Image, ImageDraw

from MobileAgent.text_localization import ocr
from MobileAgent.icon_localization import det
from MobileAgent.local_server import mobile_agent_infer

from modelscope import snapshot_download
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


chatbot_css = """
<style>
.chat-container {
    display: flex;
    flex-direction: column;
    overflow-y: auto;
    max-height: 630px;
    margin: 10px;
}
.user-message, .bot-message {
    margin: 5px;
    padding: 10px;
    border-radius: 10px;
}
.user-message {
    text-align: right;
    background-color: #7B68EE;
    color: white;
    align-self: flex-end;
}
.bot-message {
    text-align: left;
    background-color: #ADD8E6;
    color: black;
    align-self: flex-start;
}
.user-image {
    text-align: right;
    align-self: flex-end;
    max-width: 150px;
    max-height: 300px;
}
.bot-image {
    text-align: left;
    align-self: flex-start;
    max-width: 200px;
    max-height: 400px;
}
</style>
"""


temp_file = "temp"
screenshot = "screenshot"
cache = "cache"
if not os.path.exists(temp_file):
    os.mkdir(temp_file)
if not os.path.exists(screenshot):
    os.mkdir(screenshot)
if not os.path.exists(cache):
    os.mkdir(cache)


groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)
ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def get_all_files_in_folder(folder_path):
    file_list = []
    for file_name in os.listdir(folder_path):
        file_list.append(file_name)
    return file_list


def crop(image, box, i):
    image = Image.open(image)
    x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
    if x1 >= x2-10 or y1 >= y2-10:
        return
    cropped_image = image.crop((x1, y1, x2, y2))
    cropped_image.save(f"./temp/{i}.png", format="PNG")


def merge_text_blocks(text_list, coordinates_list):
    merged_text_blocks = []
    merged_coordinates = []

    sorted_indices = sorted(range(len(coordinates_list)), key=lambda k: (coordinates_list[k][1], coordinates_list[k][0]))
    sorted_text_list = [text_list[i] for i in sorted_indices]
    sorted_coordinates_list = [coordinates_list[i] for i in sorted_indices]

    num_blocks = len(sorted_text_list)
    merge = [False] * num_blocks

    for i in range(num_blocks):
        if merge[i]:
            continue

        anchor = i

        group_text = [sorted_text_list[anchor]]
        group_coordinates = [sorted_coordinates_list[anchor]]

        for j in range(i+1, num_blocks):
            if merge[j]:
                continue

            if abs(sorted_coordinates_list[anchor][0] - sorted_coordinates_list[j][0]) < 10 and \
                    sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] >= -10 and sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] < 30 and \
                    abs(sorted_coordinates_list[anchor][3] - sorted_coordinates_list[anchor][1] - (sorted_coordinates_list[j][3] - sorted_coordinates_list[j][1])) < 10:
                group_text.append(sorted_text_list[j])
                group_coordinates.append(sorted_coordinates_list[j])
                merge[anchor] = True
                anchor = j
                merge[anchor] = True

        merged_text = "\n".join(group_text)
        min_x1 = min(group_coordinates, key=lambda x: x[0])[0]
        min_y1 = min(group_coordinates, key=lambda x: x[1])[1]
        max_x2 = max(group_coordinates, key=lambda x: x[2])[2]
        max_y2 = max(group_coordinates, key=lambda x: x[3])[3]

        merged_text_blocks.append(merged_text)
        merged_coordinates.append([min_x1, min_y1, max_x2, max_y2])

    return merged_text_blocks, merged_coordinates


def get_perception_infos(screenshot_file):
    width, height = Image.open(screenshot_file).size

    text, coordinates = ocr(screenshot_file, ocr_detection, ocr_recognition)
    text, coordinates = merge_text_blocks(text, coordinates)

    perception_infos = []
    for i in range(len(coordinates)):
        perception_info = {"text": "text: " + text[i], "coordinates": coordinates[i]}
        perception_infos.append(perception_info)

    coordinates = det(screenshot_file, "icon", groundingdino_model)

    for i in range(len(coordinates)):
        perception_info = {"text": "icon", "coordinates": coordinates[i]}
        perception_infos.append(perception_info)

    image_box = []
    image_id = []
    for i in range(len(perception_infos)):
        if perception_infos[i]['text'] == 'icon':
            image_box.append(perception_infos[i]['coordinates'])
            image_id.append(i)

    for i in range(len(image_box)):
        crop(screenshot_file, image_box[i], image_id[i])

    images = get_all_files_in_folder(temp_file)
    if len(images) > 0:
        images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
        image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
        icon_map = {}
        prompt = 'This image is an icon from a phone screen. Please briefly describe the shape and color of this icon in one sentence.'

        string_image = []
        for i in range(len(images)):
            image_path = os.path.join(temp_file, images[i])
            string_image.append({"image_name": images[i], "image_file": encode_image(image_path)})
        query_data = {"task": "caption", "images": string_image, "query": prompt}
        response_query = mobile_agent_infer(query_data)
        icon_map = response_query["icon_map"]

        for i, j in zip(image_id, range(1, len(image_id)+1)):
            if icon_map.get(str(j)):
                perception_infos[i]['text'] = "icon: " + icon_map[str(j)]

    for i in range(len(perception_infos)):
        perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0]+perception_infos[i]['coordinates'][2])/2), int((perception_infos[i]['coordinates'][1]+perception_infos[i]['coordinates'][3])/2)]

    return perception_infos, width, height


def image_to_base64(image):
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    img_html = f'<img src="data:image/png;base64,{img_str}" />'
    return img_html


def chatbot(image, instruction, add_info, history, chat_log):
    if history == {}:
        thought_history = []
        summary_history = []
        action_history = []
        summary = ""
        action = ""
        completed_requirements = ""
        memory = ""
        insight = ""
        error_flag = False
        user_msg = "<div class='user-message'>{}</div>".format(instruction)
    else:
        thought_history = history["thought_history"]
        summary_history = history["summary_history"]
        action_history = history["action_history"]
        summary = history["summary"]
        action = history["action"]
        completed_requirements = history["completed_requirements"]
        memory = history["memory"][0]
        insight = history["insight"]
        error_flag = history["error_flag"]
        user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")

    images = get_all_files_in_folder(cache)
    if len(images) > 0 and len(images) <= 100:
        images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
        image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
        cur_image_id = image_id[-1] + 1
    elif len(images) > 100:
        images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
        image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
        cur_image_id = image_id[-1] + 1
        os.remove(os.path.join(cache, str(image_id[0])+".png"))
    else:
        cur_image_id = 1

    image.save(os.path.join(cache, str(cur_image_id) + ".png"), format="PNG")
    screenshot_file = os.path.join(cache, str(cur_image_id) + ".png")
    perception_infos, width, height = get_perception_infos(screenshot_file)
    shutil.rmtree(temp_file)
    os.mkdir(temp_file)

    local_screenshot_file = encode_image(screenshot_file)
    query_data = {
        "task": "decision",
        "screenshot_file": local_screenshot_file,
        "instruction": instruction,
        "perception_infos": perception_infos,
        "width": width,
        "height": height,
        "summary_history": summary_history,
        "action_history": action_history,
        "summary": summary,
        "action": action,
        "add_info": add_info,
        "error_flag": error_flag,
        "completed_requirements": completed_requirements,
        "memory": memory,
        "memory_switch": True,
        "insight": insight
    }

    response_query = mobile_agent_infer(query_data)
    output_action = response_query["decision"]
    output_memory = response_query["memory"]
    if output_action == "No token":
        bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources may be exhausted for today.")]
        chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
        return chatbot_css + chat_html, history, chat_log

    thought = output_action.split("### Thought ###")[-1].split("### Action ###")[0].replace("\n", " ").replace(":", "").replace("  ", " ").strip()
    summary = output_action.split("### Operation ###")[-1].replace("\n", " ").replace("  ", " ").strip()
    action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace("  ", " ").strip()

    output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n"
    if "None" not in output_memory and output_memory not in memory:
        memory += output_memory

    if "Open app" in action:
        bot_response = "Please click the red circle and upload the current screenshot again."
        app_name = action.split("(")[-1].split(")")[0]
        text, coordinate = ocr(screenshot_file, ocr_detection, ocr_recognition)
        for ti in range(len(text)):
            if app_name == text[ti]:
                name_coordinate = [int((coordinate[ti][0] + coordinate[ti][2])/2), int((coordinate[ti][1] + coordinate[ti][3])/2)]
                x, y = name_coordinate[0], name_coordinate[1]
                radius = 75
                draw = ImageDraw.Draw(image)
                draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
                break

    elif "Tap" in action:
        bot_response = "Please click the red circle and upload the current screenshot again."
        coordinate = action.split("(")[-1].split(")")[0].split(", ")
        x, y = int(coordinate[0]), int(coordinate[1])
        radius = 75
        draw = ImageDraw.Draw(image)
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)

    elif "Swipe" in action:
        bot_response = "Please slide from red circle to blue circle and upload the current screenshot again."
        coordinate1 = action.split("Swipe (")[-1].split("), (")[0].split(", ")
        coordinate2 = action.split("), (")[-1].split(")")[0].split(", ")
        x1, y1 = int(coordinate1[0]), int(coordinate1[1])
        x2, y2 = int(coordinate2[0]), int(coordinate2[1])
        radius = 75
        draw = ImageDraw.Draw(image)
        draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=10)
        draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=10)

    elif "Type" in action:
        if "(text)" not in action:
            text = action.split("(")[-1].split(")")[0]
        else:
            text = action.split(" \"")[-1].split("\"")[0]
        bot_response = f"Please type the \"{text}\" and upload the current screenshot again."

    elif "Back" in action:
        bot_response = f"Please back to previous page and upload the current screenshot again."

    elif "Home" in action:
        bot_response = f"Please back to home page and upload the current screenshot again."

    elif "Stop" in action:
        bot_response = f"Task completed."

    bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
    bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
    bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
    bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
    bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
    bot_memory = "<div class='bot-message'>{}</div>".format(output_memory)
    bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
    if image is not None:
        bot_img_html = image_to_base64(image)
        bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response

    chat_log.append(user_msg)

    thought_history.append(thought)
    summary_history.append(summary)
    action_history.append(action)

    history["thought_history"] = thought_history
    history["summary_history"] = summary_history
    history["action_history"] = action_history
    history["summary"] = summary
    history["action"] = action
    history["memory"] = memory,
    history["memory_switch"] = True,
    history["insight"] = insight
    history["error_flag"] = error_flag

    query_data = {
        "task": "planning",
        "instruction": instruction,
        "thought_history": thought_history,
        "summary_history": summary_history,
        "action_history": action_history,
        "completed_requirements": "",
        "add_info": add_info
    }

    response_query = mobile_agent_infer(query_data)
    output_planning = response_query["planning"]
    if output_planning == "No token":
        bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources may be exhausted for today.")]
        chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
        return chatbot_css + chat_html, history, chat_log

    output_planning = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
    history["completed_requirements"] = output_planning

    bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
    output_planning = "<div class='bot-message'>{}</div>".format(output_planning)

    chat_log.append(bot_text3)
    chat_log.append(output_planning)
    chat_log.append(bot_text1)
    chat_log.append(bot_thought)
    chat_log.append(bot_action)
    chat_log.append(bot_operation)
    chat_log.append(bot_text2)
    chat_log.append(bot_memory)
    chat_log.append(bot_response)

    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))

    return chatbot_css + chat_html, history, chat_log


def lock_input(instruction):
    return gr.update(value=instruction, interactive=False), gr.update(value=None)


def reset_demo():
    return gr.update(value="", interactive=True), gr.update(value="If you want to tap an icon of an app, use the action \"Open app\"", interactive=True), "<div class='chat-container'></div>", {}, []


tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://arxiv.org/abs/2406.01014"><img src="https://img.shields.io/badge/Arxiv-2406.01014-red"></a>
<a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
</div>
If you like our project, please give us a star ✨ on Github for the latest updates.

**Terms of use**
1. Input your instruction in \"Instruction\", for example \"Turn on the dark mode\".
2. You can input helpful operation knowledge in \"Knowledge\".
3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the screenshot after your operation.
4. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom to experience.
5. Due to limited resources, each operation may take a long time, please be patient and wait.

**使用说明**
1. 在“Instruction”中输入你的指令,例如“打开深色模式”。
2. 你可以在“Knowledge”中输入帮助性的操作知识。
3. 点击“Submit”来获得操作。你需要根据输出来操作手机,并且上传操作后的截图。
4. “Example”中的5个例子是一个任务。从上到下点击它们并且点击“Submit”来体验。
5. 由于资源有限,每次操作的时间会比较长,请耐心等待。""")

title_markdowm = ("""# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration""")

instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge", value="If you want to tap an icon of an app, use the action \"Open app\"")
with gr.Blocks() as demo:
    history_state = gr.State(value={})
    history_output = gr.State(value=[])
    with gr.Row():
        gr.Markdown(title_markdowm)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(tos_markdown)
            with gr.Row():
                image_input = gr.Image(label="Screenshot", type="pil", height=550, width=230)
                gr.Examples(examples=[
                    ["./example/example_1.jpg", "Turn on the dark mode"],
                    ["./example/example_2.jpg", "Turn on the dark mode"],
                    ["./example/example_3.jpg", "Turn on the dark mode"],
                    ["./example/example_4.jpg", "Turn on the dark mode"],
                    ["./example/example_5.jpg", "Turn on the dark mode"],
                ], inputs=[image_input, instruction_input, knowledge_input])

        with gr.Column(scale=6):
            instruction_input.render()
            knowledge_input.render()
            with gr.Row():
                start_button = gr.Button("Submit")
                clear_button = gr.Button("Clear")
            output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")

    start_button.click(
        fn=lambda image, instruction, add_info, history, output: chatbot(image, instruction, add_info, history, output),
        inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
        outputs=[output_component, history_state, history_output]
    )

    clear_button.click(
        fn=reset_demo,
        inputs=[],
        outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
    )

demo.queue().launch(share=True)
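A small sketch of how chatbot() above turns the model reply into its three parts; the reply text is a made-up example, while the split markers and string handling mirror the parsing in the function.

# Parsing sketch (illustrative reply, not part of this commit).
output_action = (
    "### Thought ###\nThe Settings app is visible on the desktop.\n"
    "### Action ###\nOpen app (Settings)\n"
    "### Operation ###\nOpen the Settings app to find the dark mode switch."
)
thought = output_action.split("### Thought ###")[-1].split("### Action ###")[0].replace("\n", " ").strip()
action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").strip()
summary = output_action.split("### Operation ###")[-1].replace("\n", " ").strip()
print(thought, "|", action, "|", summary)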
cache/1.png
ADDED
cache/10.png
ADDED
cache/11.png
ADDED
cache/12.png
ADDED
cache/13.png
ADDED
cache/14.png
ADDED
cache/15.png
ADDED
cache/16.png
ADDED
cache/17.png
ADDED
cache/18.png
ADDED
cache/19.png
ADDED
cache/2.png
ADDED
cache/20.png
ADDED
cache/21.png
ADDED
cache/22.png
ADDED
cache/23.png
ADDED
cache/24.png
ADDED
cache/25.png
ADDED
cache/3.png
ADDED
cache/4.png
ADDED
cache/5.png
ADDED
cache/6.png
ADDED
cache/7.png
ADDED
cache/8.png
ADDED
cache/9.png
ADDED
example/example_1.jpg
ADDED
example/example_2.jpg
ADDED
example/example_3.jpg
ADDED
example/example_4.jpg
ADDED
example/example_5.jpg
ADDED