Upload folder using huggingface_hub
lmdeploy_infer.py
ADDED
@@ -0,0 +1,243 @@
import base64
import os
import ast
from io import BytesIO
from typing import List, Union

from PIL import Image, ImageFile
import numpy as np
from scipy.spatial.transform import Rotation

from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig

IMAGE_TOKEN = '<IMAGE_TOKEN>'

def normalize_quaternion(quat):
    return np.array(quat) / np.linalg.norm(quat, axis=-1, keepdims=True)


def quaternion_to_discrete_euler(quaternion, bins_num=256):
    # Shift Euler angles from [-180, 180) into [0, 360) before binning
    euler = Rotation.from_quat(quaternion).as_euler('xyz', degrees=True) + 180
    resolution = 360 / bins_num
    disc = np.around(euler / resolution).astype(int)
    disc[disc == bins_num] = 0  # wrap the top edge back around to bin 0
    return disc


def discrete_euler_to_quaternion(discrete_euler, bins_num=256):
    resolution = 360 / bins_num
    euler = (discrete_euler * resolution) - 180
    return Rotation.from_euler('xyz', euler, degrees=True).as_quat()

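# --- Added sketch (not in the original script): roundtrip sanity check for the
# Euler helpers above. With 256 bins the resolution is 360 / 256 = 1.40625
# degrees, so decoding an encoded rotation should land within about one bin.
_demo_quat = Rotation.from_euler('xyz', [30.0, -45.0, 90.0], degrees=True).as_quat()
_demo_bins = quaternion_to_discrete_euler(_demo_quat, bins_num=256)
_demo_back = Rotation.from_quat(discrete_euler_to_quaternion(_demo_bins, bins_num=256))
# Angle between the original and reconstructed rotations, in degrees
_demo_err = (Rotation.from_quat(_demo_quat).inv() * _demo_back).magnitude() * 180 / np.pi
print(f"[sanity] euler-bin roundtrip error: {_demo_err:.3f} degrees")
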
class RotationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        """
        Note: the input action is a quaternion.
        Args:
            bins_num: Number of bins to discretize the rotation space into.
        """
        self.bins_num = bins_num

    def discretize(self, action: Union[np.ndarray, List[float]], degrees=False):
        # A 4-element action is a quaternion; otherwise treat it as Euler angles
        if len(action) == 4:
            return quaternion_to_discrete_euler(normalize_quaternion(action), bins_num=self.bins_num)
        else:
            return quaternion_to_discrete_euler(
                normalize_quaternion(Rotation.from_euler('xyz', action, degrees=degrees).as_quat()),
                bins_num=self.bins_num
            )

    def undiscretize(self, discrete_action):
        return normalize_quaternion(discrete_euler_to_quaternion(discrete_action, bins_num=self.bins_num))

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<rot{i}>" for i in range(self.bins_num)]

    def map_4d_quaternion_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<rot{a}>" for a in discretized_action]

    def map_roll_pitch_yaw_to_special_tokens(self, roll_pitch_yaw: Union[np.ndarray, List[float]], degrees=False) -> List[str]:
        discretized_action = self.discretize(roll_pitch_yaw, degrees)
        return [f"<rot{a}>" for a in discretized_action]

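# --- Added sketch (not in the original script): rotation bins as special tokens.
# Both the quaternion and roll/pitch/yaw entry points funnel through the same
# Euler binning, so the +180 shift puts 0 degrees at bin 128 and 90 degrees
# at bin 192.
_rot_demo = RotationActionDiscretizer(bins_num=256)
print(_rot_demo.map_roll_pitch_yaw_to_special_tokens([0.0, 0.0, 90.0], degrees=True))
# -> ['<rot128>', '<rot128>', '<rot192>']
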
class TranslationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        self.bins_num = bins_num
        self.min_action = min_action
        self.max_action = max_action

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_action, max_action, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, action: np.ndarray):
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        discretized_action = np.digitize(action, self.bins)
        return discretized_action

    def undiscretize(self, discrete_action):
        """
        NOTE =>> Because the actions are discretized w.r.t. the bins (and not the bin centers), the
                 digitization returns bin indices between [1, # bins], inclusive, when there are actually
                 only (# bins - 1) bin intervals.

        Therefore, if the digitization returns the last possible index, we map this to the last bin interval.

        EXAMPLE =>> Let's say self.bins has 256 values. Then self.bin_centers has 255 values. Digitization
                    returns indices between [1, 256]. We subtract 1 from all indices so that they are between
                    [0, 255]. There is still one index (i == 255) that would cause an out-of-bounds error if
                    used to index into self.bin_centers. Therefore, if i == 255, we subtract 1 from it so that
                    it becomes the index of the last bin center. We implement this simply via clipping between
                    [0, 255 - 1].
        """
        discrete_action = np.clip(discrete_action - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        undiscretized_action = self.bin_centers[discrete_action]

        # Clamp the result to the action bounds
        return np.clip(undiscretized_action, self.min_action, self.max_action)

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<loc{i}>" for i in range(self.bins_num)]

    def map_3d_action_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<loc{a}>" for a in discretized_action]

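# --- Added sketch (not in the original script): roundtrip through the
# translation bins, exercising the off-by-one handling described in
# undiscretize(): np.digitize returns indices in [1, bins_num], which are
# shifted and clipped onto the 255 bin centers.
_trans_demo = TranslationActionDiscretizer(bins_num=256, min_action=-1, max_action=1)
_demo_idx = _trans_demo.discretize(np.array([-1.0, 0.0, 1.0]))
print(_demo_idx, _trans_demo.undiscretize(_demo_idx))
# -> indices [1, 128, 256], decoded values approximately [-0.996, 0.0, 0.996]:
#    recovery error stays within half a bin width (1/255, about 0.004)
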
class OpennessActionDiscretizer:
    def __init__(self, bins_num=256, min_openness=0, max_openness=1):
        """
        Args:
            bins_num: Number of bins to discretize the openness space into.
            min_openness: Minimum openness of the gripper.
            max_openness: Maximum openness of the gripper.
        """
        self.bins_num = bins_num
        self.min_openness = min_openness
        self.max_openness = max_openness

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_openness, max_openness, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, openness: float):
        openness = np.clip(openness, a_min=self.min_openness, a_max=self.max_openness)
        discretized_openness = np.digitize(openness, self.bins)
        return discretized_openness

    def undiscretize(self, discrete_openness):
        # Same off-by-one handling as TranslationActionDiscretizer.undiscretize
        discrete_openness = np.clip(discrete_openness - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        return self.bin_centers[discrete_openness]

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<open{i}>" for i in range(self.bins_num)]

    def map_openness_to_special_tokens(self, openness) -> List[str]:
        discretized_openness = self.discretize(openness)
        return [f"<open{discretized_openness}>"]

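# --- Added sketch (not in the original script): gripper openness is a single
# scalar, so it maps to exactly one <openN> token.
_open_demo = OpennessActionDiscretizer(bins_num=256, min_openness=0, max_openness=1)
print(_open_demo.map_openness_to_special_tokens(0.5))  # -> ['<open128>']
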
# Kept for reference: depends on load_jsonl and sample_save_folder, which are
# not defined in this file.
# def construct_lmdeploy_tasks(jsonl_path):
#     data = load_jsonl(jsonl_path)

#     lmdeploy_tasks = []
#     for sample_idx, item in enumerate(data):

#         langs = item["conversations"][0]["value"]
#         langs = langs.replace("<image>", IMAGE_TOKEN)
#         image_urls = [
#             os.path.join(sample_save_folder, f"{sample_idx}_{im_idx}.png") for im_idx in range(len(item["image"]))
#         ]
#         gt_lang = item["conversations"][1]["value"]
#         lmdeploy_tasks.append((langs, image_urls, gt_lang))

#     return lmdeploy_tasks

def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
    """Load an image from a base64-encoded string or bytes."""
    return Image.open(BytesIO(base64.b64decode(image)))


def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
    """Load an image from a local path, or pass through a PIL.Image.

    NOTE: the URL-fetching branch has been removed from this copy, so
    FETCH_TIMEOUT and headers are currently unused.
    """
    FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        if isinstance(image_url, Image.Image):
            img = image_url
        else:
            # Load image from local path
            img = Image.open(image_url)

        # Check that the image is valid
        img = img.convert('RGB')
    except Exception as error:
        if isinstance(image_url, str) and len(image_url) > 100:
            image_url = image_url[:100] + ' ...'
        print(f'{error}, image_url={image_url}')
        # Fall back to a dummy image so the pipeline does not crash
        img = Image.new('RGB', (32, 32))

    return img

model = '/mnt/petrelfs/huangsiyuan/VLA/InternVL/internvl_chat/output/internvla_8b_1node_with_visual_traces_wo_sp_token_w_cam/VLA8B_V1'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=2048),
    chat_template_config=ChatTemplateConfig(model_name='internvl2-internlm2'),
)

# Per-dimension bounds for the discretizers
TRANS_MIN = -0.275
TRANS_MAX = 0.275

ROT_MIN = -0.350
ROT_MAX = 0.395

OPEN_MIN = -0.388
OPEN_MAX = 0.300

translation_bins_num = 256
rotation_bins_num = 256
openness_bins_num = 256
translation_action_discretizer = TranslationActionDiscretizer(bins_num=translation_bins_num, min_action=TRANS_MIN, max_action=TRANS_MAX)
# NOTE: RotationActionDiscretizer accepts but ignores min_action/max_action;
# rotation bins are fixed over [0, 360) degrees
rotation_action_discretizer = RotationActionDiscretizer(bins_num=rotation_bins_num, min_action=ROT_MIN, max_action=ROT_MAX)
openness_action_discretizer = OpennessActionDiscretizer(bins_num=openness_bins_num, min_openness=OPEN_MIN, max_openness=OPEN_MAX)

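# --- Added sketch (not in the original script; the raw action values below are
# made up): the model's answer is a 7-integer list of 3 translation bins,
# 3 rotation bins, and 1 openness bin, matching the decoding at the bottom of
# this script.
_demo_xyz_bins = translation_action_discretizer.discretize(np.array([0.01, -0.02, 0.03]))
_demo_rpy_bins = rotation_action_discretizer.discretize(np.array([0.05, -0.05, 0.10]))  # radians (degrees=False)
_demo_open_bin = openness_action_discretizer.discretize(0.2)
print("[" + ",".join(map(str, [*_demo_xyz_bins, *_demo_rpy_bins, _demo_open_bin])) + "]")
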
VQA_FORMAT = f"{IMAGE_TOKEN}\n {IMAGE_TOKEN}\n Given the observation images from the wrist camera mounted at CAM_PARAM and the overhead camera mounted at CAM_PARAM, please provide the action that the robot should take to finish the task: TASK"
# question_template = "<image>\n <image>\n Given the observation images from the wrist camera mounted at <cam>[256,89,256,236,129,181]</cam> and the overhead camera mounted at <cam>[82,1,256,54,128,98]</cam>, please provide the action that the robot should take to finish the task: place a chess piece on the chessboard"

# Camera extrinsics: [x, y, z, roll, pitch, yaw] for each camera
wrist_cam_pose = [0.3618544138321802, -0.08323374464523976, 0.41759402329169787, 2.6584232953914344, 0.035482430406705845, 1.2906347836099603]
overhead_cam_pose = [-0.09877916942983442, -0.3919519409041736, 0.4780865865815033, -1.8237694898473762, -0.012183613523460979, -0.746683044221379]
cam_pose_list = [wrist_cam_pose, overhead_cam_pose]
for cam_pose in cam_pose_list:
    # Discretize the camera pose with the same bins as the actions and splice it
    # into the prompt as a <cam>[...]</cam> tag, one CAM_PARAM slot per camera.
    # Coordinates outside [TRANS_MIN, TRANS_MAX] saturate at the edge bins.
    cam_xyz_token = translation_action_discretizer.discretize(np.array(cam_pose[:3]))
    cam_rpy_token = rotation_action_discretizer.discretize(np.array(cam_pose[3:6]))
    cam_action_tokens = [cam_xyz_token[0], cam_xyz_token[1], cam_xyz_token[2], cam_rpy_token[0], cam_rpy_token[1], cam_rpy_token[2]]
    cam_action_tokens_str = "<cam>[" + ",".join(map(str, cam_action_tokens)) + "]</cam>"
    VQA_FORMAT = VQA_FORMAT.replace("CAM_PARAM", cam_action_tokens_str, 1)

# Task language instruction
task = "Pick up the green object from the table and put it in the bowl"
VQA_FORMAT = VQA_FORMAT.replace("TASK", task)

img1 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_0.png"
img2 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_1.png"
images = [load_image(img1), load_image(img2)]  # only need to return the PIL.Image objects
response = pipe((VQA_FORMAT, images))
print(response.text)
print("gt: [124,137,104,126,130,129,233]")

# The model answers with a 7-integer list: 3 translation bins, 3 rotation bins,
# and 1 openness bin. Parse it and map each group back to continuous values.
action_list = np.array(ast.literal_eval(response.text))
xyz = translation_action_discretizer.undiscretize(action_list[:3])
rpy = rotation_action_discretizer.undiscretize(action_list[3:6])  # NOTE: returns a quaternion (x, y, z, w)
openness = openness_action_discretizer.undiscretize(action_list[6])

print(f"xyz: {xyz}, rpy: {rpy}, openness: {openness}")
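
# --- Added follow-up (not in the original script): the rotation undiscretize()
# returns a quaternion despite the rpy name; a controller that expects Euler
# angles would convert it back like this:
rpy_degrees = Rotation.from_quat(rpy).as_euler('xyz', degrees=True)
print(f"rpy (degrees): {rpy_degrees}")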