SiyuanH committed on
Commit ba807b4
1 Parent(s): a8e60f3

Upload folder using huggingface_hub

Files changed (1):
  lmdeploy_infer.py +243 -0
lmdeploy_infer.py ADDED
@@ -0,0 +1,243 @@
import ast
import base64
import os
from io import BytesIO
from typing import List, Union

import numpy as np
from PIL import Image, ImageFile
from scipy.spatial.transform import Rotation

from lmdeploy import ChatTemplateConfig, TurbomindEngineConfig, pipeline

IMAGE_TOKEN = '<IMAGE_TOKEN>'
def normalize_quaternion(quat):
    """Normalize a quaternion (or batch of quaternions) to unit length."""
    return np.array(quat) / np.linalg.norm(quat, axis=-1, keepdims=True)


def quaternion_to_discrete_euler(quaternion, bins_num=256):
    """Convert a quaternion to per-axis discrete euler bins in [0, bins_num)."""
    euler = Rotation.from_quat(quaternion).as_euler('xyz', degrees=True) + 180
    resolution = 360 / bins_num
    disc = np.around(euler / resolution).astype(int)
    disc[disc == bins_num] = 0  # 360 deg wraps around to bin 0
    return disc


def discrete_euler_to_quaternion(discrete_euler, bins_num=256):
    """Inverse of quaternion_to_discrete_euler, exact up to bin resolution."""
    resolution = 360 / bins_num
    euler = (discrete_euler * resolution) - 180
    return Rotation.from_euler('xyz', euler, degrees=True).as_quat()
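# Editor-added sanity check (illustrative, not from the original script): a
# quaternion survives the discretize/undiscretize round trip above up to the
# bin resolution, 360 / 256 = ~1.4 degrees per axis.
_q = np.array([0.0, 0.0, 0.0, 1.0])  # identity rotation; scipy uses (x, y, z, w)
assert np.allclose(discrete_euler_to_quaternion(quaternion_to_discrete_euler(_q)), _q, atol=0.02)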
class RotationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        """
        Note: the input action is a quaternion (or euler angles; see discretize).

        Args:
            bins_num: Number of bins to discretize the rotation space into.
            min_action / max_action: Accepted for interface parity with the
                other discretizers but unused; rotations always span 360 deg.
        """
        self.bins_num = bins_num

    def discretize(self, action: Union[np.ndarray, List[float]], degrees=False):
        # A 4-element action is a quaternion; otherwise treat it as euler angles
        if len(action) == 4:
            return quaternion_to_discrete_euler(normalize_quaternion(action), bins_num=self.bins_num)
        else:
            return quaternion_to_discrete_euler(
                normalize_quaternion(Rotation.from_euler('xyz', action, degrees=degrees).as_quat()),
                bins_num=self.bins_num
            )

    def undiscretize(self, discrete_action):
        # NOTE: returns a unit quaternion (x, y, z, w), not euler angles
        return normalize_quaternion(discrete_euler_to_quaternion(discrete_action, bins_num=self.bins_num))

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<rot{i}>" for i in range(self.bins_num)]

    def map_4d_quaternion_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<rot{a}>" for a in discretized_action]

    def map_roll_pitch_yaw_to_special_tokens(self, roll_pitch_yaw: Union[np.ndarray, List[float]], degrees=False) -> List[str]:
        discretized_action = self.discretize(roll_pitch_yaw, degrees)
        return [f"<rot{a}>" for a in discretized_action]
class TranslationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        self.bins_num = bins_num
        self.min_action = min_action
        self.max_action = max_action

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_action, max_action, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, action: np.ndarray):
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        discretized_action = np.digitize(action, self.bins)
        return discretized_action

    def undiscretize(self, discrete_action):
        """
        NOTE =>> Because actions are discretized w.r.t. the bins (and not the bin centers),
                 digitization returns bin indices in [1, bins_num] inclusive, while there are
                 only (bins_num - 1) bin intervals. If digitization returns the last possible
                 index, we map it to the last bin interval.

        EXAMPLE =>> If self.bins has 256 values, self.bin_centers has 255. Digitization
                    returns indices in [1, 256]; subtracting 1 maps them to [0, 255], but
                    i == 255 would still be out of bounds for self.bin_centers, so we clip
                    to [0, 254], i.e. the index of the last bin center.
        """
        discrete_action = np.clip(discrete_action - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        undiscretized_action = self.bin_centers[discrete_action]

        # Clamp the result to the action bounds
        return np.clip(undiscretized_action, self.min_action, self.max_action)

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<loc{i}>" for i in range(self.bins_num)]

    def map_3d_action_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<loc{a}>" for a in discretized_action]
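# Editor-added round-trip check (illustrative, not from the original script):
# undiscretize(discretize(x)) recovers x to within half a bin width,
# (max_action - min_action) / (bins_num - 1) / 2, about 1.1 mm for the
# +/-0.275 m translation range used later in this file.
_t = TranslationActionDiscretizer(bins_num=256, min_action=-0.275, max_action=0.275)
_xyz = np.array([0.1, -0.05, 0.2])
assert np.all(np.abs(_t.undiscretize(_t.discretize(_xyz)) - _xyz) <= 0.55 / 255 / 2 + 1e-9)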
class OpennessActionDiscretizer:
    def __init__(self, bins_num=256, min_openness=0, max_openness=1):
        """
        Args:
            bins_num: Number of bins to discretize the openness space into.
            min_openness: Minimum openness of the gripper.
            max_openness: Maximum openness of the gripper.
        """
        self.bins_num = bins_num
        self.min_openness = min_openness
        self.max_openness = max_openness

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_openness, max_openness, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, openness: float):
        openness = np.clip(openness, a_min=self.min_openness, a_max=self.max_openness)
        discretized_openness = np.digitize(openness, self.bins)
        return discretized_openness

    def undiscretize(self, discrete_openness):
        # Same index shift/clip as TranslationActionDiscretizer.undiscretize
        discrete_openness = np.clip(discrete_openness - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        return self.bin_centers[discrete_openness]

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<open{i}>" for i in range(self.bins_num)]

    def map_openness_to_special_tokens(self, openness) -> List[str]:
        discretized_openness = self.discretize(openness)
        return [f"<open{discretized_openness}>"]
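# Editor-added usage sketch (illustrative; the bounds mirror the TRANS_* and
# OPEN_* constants defined later in this script): a full 7-DoF action (xyz
# translation, roll/pitch/yaw rotation, gripper openness) rendered as the
# special tokens the three discretizers emit.
_demo_trans = TranslationActionDiscretizer(bins_num=256, min_action=-0.275, max_action=0.275)
_demo_rot = RotationActionDiscretizer(bins_num=256)
_demo_open = OpennessActionDiscretizer(bins_num=256, min_openness=-0.388, max_openness=0.300)
_demo_tokens = (
    _demo_trans.map_3d_action_to_special_tokens(np.array([0.1, -0.05, 0.2]))
    + _demo_rot.map_roll_pitch_yaw_to_special_tokens([0.1, -0.05, 0.3])  # radians by default
    + _demo_open.map_openness_to_special_tokens(0.25)
)
# _demo_tokens is 7 strings: ['<loc..>', '<loc..>', '<loc..>', '<rot..>', '<rot..>', '<rot..>', '<open..>']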
# NOTE: load_jsonl and sample_save_folder below are not defined in this file.
# def construct_lmdeploy_tasks(jsonl_path):
#     data = load_jsonl(jsonl_path)
#
#     lmdeploy_tasks = []
#     for sample_idx, item in enumerate(data):
#         langs = item["conversations"][0]["value"]
#         langs = langs.replace("<image>", IMAGE_TOKEN)
#         image_urls = [
#             os.path.join(sample_save_folder, f"{sample_idx}_{im_idx}.png") for im_idx in range(len(item["image"]))
#         ]
#         gt_lang = item["conversations"][1]["value"]
#         lmdeploy_tasks.append((langs, image_urls, gt_lang))
#
#     return lmdeploy_tasks


def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
    """Load an image from base64-encoded bytes or string."""
    return Image.open(BytesIO(base64.b64decode(image)))


def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
    """Load an image from a local path, or pass a PIL.Image through unchanged.

    Falls back to a 32x32 dummy image when loading fails.
    """
    # NOTE: FETCH_TIMEOUT and the browser-style headers are leftovers from a
    # URL-fetching variant of this helper; they are unused below because only
    # local paths and PIL images are handled.
    FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        if isinstance(image_url, Image.Image):
            img = image_url
        else:
            # Load image from local path
            img = Image.open(image_url)

        # Converting validates the file and normalizes the mode
        img = img.convert('RGB')
    except Exception as error:
        if isinstance(image_url, str) and len(image_url) > 100:
            image_url = image_url[:100] + ' ...'
        print(f'{error}, image_url={image_url}')
        # Use a dummy image so the pipeline can still run
        img = Image.new('RGB', (32, 32))

    return img
model = '/mnt/petrelfs/huangsiyuan/VLA/InternVL/internvl_chat/output/internvla_8b_1node_with_visual_traces_wo_sp_token_w_cam/VLA8B_V1'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=2048),
    chat_template_config=ChatTemplateConfig(model_name='internvl2-internlm2'),
)

# Action bounds used by the discretizers below
TRANS_MAX = 0.275
TRANS_MIN = -0.275

ROT_MIN = -0.350
ROT_MAX = 0.395

OPEN_MIN = -0.388
OPEN_MAX = 0.300

translation_bins_num = 256
rotation_bins_num = 256
openness_bins_num = 256
translation_action_discretizer = TranslationActionDiscretizer(bins_num=translation_bins_num, max_action=TRANS_MAX, min_action=TRANS_MIN)
rotation_action_discretizer = RotationActionDiscretizer(bins_num=rotation_bins_num, min_action=ROT_MIN, max_action=ROT_MAX)  # min/max unused; see class note
openness_action_discretizer = OpennessActionDiscretizer(bins_num=openness_bins_num, min_openness=OPEN_MIN, max_openness=OPEN_MAX)

VQA_FORMAT = f"{IMAGE_TOKEN}\n {IMAGE_TOKEN}\n Given the observation images from the wrist camera mounted at CAM_PARAM and the overhead camera mounted at CAM_PARAM, please provide the action that the robot should take to finish the task: TASK"
# question_template = "<image>\n <image>\n Given the observation images from the wrist camera mounted at <cam>[256,89,256,236,129,181]</cam> and the overhead camera mounted at <cam>[82,1,256,54,128,98]</cam>, please provide the action that the robot should take to finish the task: place a chess piece on the chessboard"

# Camera poses: [x, y, z] translation followed by [roll, pitch, yaw] rotation
wrist_cam_pose = [0.3618544138321802, -0.08323374464523976, 0.41759402329169787, 2.6584232953914344, 0.035482430406705845, 1.2906347836099603]
overhead_cam_pose = [-0.09877916942983442, -0.3919519409041736, 0.4780865865815033, -1.8237694898473762, -0.012183613523460979, -0.746683044221379]
cam_pose_list = [wrist_cam_pose, overhead_cam_pose]
for cam_pose in cam_pose_list:
    cam_xyz_token = translation_action_discretizer.discretize(np.array(cam_pose[:3]))
    cam_rpy_token = rotation_action_discretizer.discretize(np.array(cam_pose[3:6]))
    cam_action_tokens = [cam_xyz_token[0], cam_xyz_token[1], cam_xyz_token[2], cam_rpy_token[0], cam_rpy_token[1], cam_rpy_token[2]]
    cam_action_tokens_str = "<cam>[" + ",".join(map(str, cam_action_tokens)) + "]</cam>"
    # Substitute the placeholders one at a time: wrist camera first, then overhead
    VQA_FORMAT = VQA_FORMAT.replace("CAM_PARAM", cam_action_tokens_str, 1)

# Task language instruction
task = "Pick up the green object from the table and put it in the bowl"
VQA_FORMAT = VQA_FORMAT.replace("TASK", task)

img1 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_0.png"
img2 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_1.png"
images = [load_image(img1), load_image(img2)]  # the pipeline only needs PIL.Image objects
response = pipe((VQA_FORMAT, images))
print(response.text)
print("gt: [124,137,104,126,130,129,233]")

# Decode the 7 discrete tokens back into continuous values
action_list = np.array(ast.literal_eval(response.text))
xyz = translation_action_discretizer.undiscretize(action_list[:3])
rpy = rotation_action_discretizer.undiscretize(action_list[3:6])  # NOTE: returns a quaternion (x, y, z, w)
openness = openness_action_discretizer.undiscretize(action_list[6])

print(f"xyz: {xyz}, rpy: {rpy}, openness: {openness}")