Upload folder using huggingface_hub
lmdeploy_infer.py
ADDED
@@ -0,0 +1,243 @@
import base64
import os
import ast
from io import BytesIO
from typing import List, Union

from PIL import Image, ImageFile
import numpy as np
from scipy.spatial.transform import Rotation

from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig

IMAGE_TOKEN = '<IMAGE_TOKEN>'

def normalize_quaternion(quat):
    return np.array(quat) / np.linalg.norm(quat, axis=-1, keepdims=True)


def quaternion_to_discrete_euler(quaternion, bins_num=256):
    # Shift Euler angles from [-180, 180) into [0, 360) before binning
    euler = Rotation.from_quat(quaternion).as_euler('xyz', degrees=True) + 180
    resolution = 360 / bins_num
    disc = np.around(euler / resolution).astype(int)
    disc[disc == bins_num] = 0  # wrap the top edge back around to bin 0
    return disc


def discrete_euler_to_quaternion(discrete_euler, bins_num=256):
    resolution = 360 / bins_num
    euler = (discrete_euler * resolution) - 180
    return Rotation.from_euler('xyz', euler, degrees=True).as_quat()

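# --- Added sketch (not in the original script): roundtrip sanity check for the
# Euler helpers above. With 256 bins the resolution is 360 / 256 = 1.40625
# degrees, so decoding an encoded rotation should land within about one bin.
_demo_quat = Rotation.from_euler('xyz', [30.0, -45.0, 90.0], degrees=True).as_quat()
_demo_bins = quaternion_to_discrete_euler(_demo_quat, bins_num=256)
_demo_back = Rotation.from_quat(discrete_euler_to_quaternion(_demo_bins, bins_num=256))
# Angle between the original and reconstructed rotations, in degrees
_demo_err = (Rotation.from_quat(_demo_quat).inv() * _demo_back).magnitude() * 180 / np.pi
print(f"[sanity] euler-bin roundtrip error: {_demo_err:.3f} degrees")
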
class RotationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        """
        Note: the input action is a quaternion.
        Args:
            bins_num: Number of bins to discretize the rotation space into.
        """
        self.bins_num = bins_num

    def discretize(self, action: Union[np.ndarray, List[float]], degrees=False):
        # A 4-element action is a quaternion; otherwise treat it as Euler angles
        if len(action) == 4:
            return quaternion_to_discrete_euler(normalize_quaternion(action), bins_num=self.bins_num)
        else:
            return quaternion_to_discrete_euler(
                normalize_quaternion(Rotation.from_euler('xyz', action, degrees=degrees).as_quat()),
                bins_num=self.bins_num
            )

    def undiscretize(self, discrete_action):
        return normalize_quaternion(discrete_euler_to_quaternion(discrete_action, bins_num=self.bins_num))

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<rot{i}>" for i in range(self.bins_num)]

    def map_4d_quaternion_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<rot{a}>" for a in discretized_action]

    def map_roll_pitch_yaw_to_special_tokens(self, roll_pitch_yaw: Union[np.ndarray, List[float]], degrees=False) -> List[str]:
        discretized_action = self.discretize(roll_pitch_yaw, degrees)
        return [f"<rot{a}>" for a in discretized_action]

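# --- Added sketch (not in the original script): rotation bins as special tokens.
# Both the quaternion and roll/pitch/yaw entry points funnel through the same
# Euler binning, so the +180 shift puts 0 degrees at bin 128 and 90 degrees
# at bin 192.
_rot_demo = RotationActionDiscretizer(bins_num=256)
print(_rot_demo.map_roll_pitch_yaw_to_special_tokens([0.0, 0.0, 90.0], degrees=True))
# -> ['<rot128>', '<rot128>', '<rot192>']
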
class TranslationActionDiscretizer:
    def __init__(self, bins_num=256, min_action=-1, max_action=1):
        self.bins_num = bins_num
        self.min_action = min_action
        self.max_action = max_action

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_action, max_action, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, action: np.ndarray):
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        discretized_action = np.digitize(action, self.bins)
        return discretized_action

    def undiscretize(self, discrete_action):
        """
        NOTE =>> Because the actions are discretized w.r.t. the bins (and not the bin centers), the
                 digitization returns bin indices between [1, # bins], inclusive, when there are actually
                 only (# bins - 1) bin intervals.

        Therefore, if the digitization returns the last possible index, we map this to the last bin interval.

        EXAMPLE =>> Let's say self.bins has 256 values. Then self.bin_centers has 255 values. Digitization
                    returns indices between [1, 256]. We subtract 1 from all indices so that they are between
                    [0, 255]. There is still one index (i == 255) that would cause an out-of-bounds error if
                    used to index into self.bin_centers. Therefore, if i == 255, we subtract 1 from it so that
                    it becomes the index of the last bin center. We implement this simply via clipping between
                    [0, 255 - 1].
        """
        discrete_action = np.clip(discrete_action - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        undiscretized_action = self.bin_centers[discrete_action]

        # Clamp the result to the action bounds
        return np.clip(undiscretized_action, self.min_action, self.max_action)

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<loc{i}>" for i in range(self.bins_num)]

    def map_3d_action_to_special_tokens(self, action) -> List[str]:
        discretized_action = self.discretize(action)
        return [f"<loc{a}>" for a in discretized_action]

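# --- Added sketch (not in the original script): roundtrip through the
# translation bins, exercising the off-by-one handling described in
# undiscretize(): np.digitize returns indices in [1, bins_num], which are
# shifted and clipped onto the 255 bin centers.
_trans_demo = TranslationActionDiscretizer(bins_num=256, min_action=-1, max_action=1)
_demo_idx = _trans_demo.discretize(np.array([-1.0, 0.0, 1.0]))
print(_demo_idx, _trans_demo.undiscretize(_demo_idx))
# -> indices [1, 128, 256], decoded values approximately [-0.996, 0.0, 0.996]:
#    recovery error stays within half a bin width (1/255, about 0.004)
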
class OpennessActionDiscretizer:
    def __init__(self, bins_num=256, min_openness=0, max_openness=1):
        """
        Args:
            bins_num: Number of bins to discretize the openness space into.
            min_openness: Minimum openness of the gripper.
            max_openness: Maximum openness of the gripper.
        """
        self.bins_num = bins_num
        self.min_openness = min_openness
        self.max_openness = max_openness

        # Create uniform bins and compute bin centers
        self.bins = np.linspace(min_openness, max_openness, bins_num)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0

    def discretize(self, openness: float):
        openness = np.clip(openness, a_min=self.min_openness, a_max=self.max_openness)
        discretized_openness = np.digitize(openness, self.bins)
        return discretized_openness

    def undiscretize(self, discrete_openness):
        # Same off-by-one handling as TranslationActionDiscretizer.undiscretize
        discrete_openness = np.clip(discrete_openness - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
        return self.bin_centers[discrete_openness]

    def get_action_space(self):
        return self.bins_num

    def generate_discrete_special_tokens(self) -> List[str]:
        return [f"<open{i}>" for i in range(self.bins_num)]

    def map_openness_to_special_tokens(self, openness) -> List[str]:
        discretized_openness = self.discretize(openness)
        return [f"<open{discretized_openness}>"]

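# --- Added sketch (not in the original script): gripper openness is a single
# scalar, so it maps to exactly one <openN> token.
_open_demo = OpennessActionDiscretizer(bins_num=256, min_openness=0, max_openness=1)
print(_open_demo.map_openness_to_special_tokens(0.5))  # -> ['<open128>']
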
# Kept for reference: depends on load_jsonl and sample_save_folder, which are
# not defined in this file.
# def construct_lmdeploy_tasks(jsonl_path):
#     data = load_jsonl(jsonl_path)

#     lmdeploy_tasks = []
#     for sample_idx, item in enumerate(data):

#         langs = item["conversations"][0]["value"]
#         langs = langs.replace("<image>", IMAGE_TOKEN)
#         image_urls = [
#             os.path.join(sample_save_folder, f"{sample_idx}_{im_idx}.png") for im_idx in range(len(item["image"]))
#         ]
#         gt_lang = item["conversations"][1]["value"]
#         lmdeploy_tasks.append((langs, image_urls, gt_lang))

#     return lmdeploy_tasks

def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
    """Load an image from a base64-encoded string or bytes."""
    return Image.open(BytesIO(base64.b64decode(image)))


def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
    """Load an image from a local path, or pass through a PIL.Image.

    NOTE: the URL-fetching branch has been removed from this copy, so
    FETCH_TIMEOUT and headers are currently unused.
    """
    FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        if isinstance(image_url, Image.Image):
            img = image_url
        else:
            # Load image from local path
            img = Image.open(image_url)

        # Check that the image is valid
        img = img.convert('RGB')
    except Exception as error:
        if isinstance(image_url, str) and len(image_url) > 100:
            image_url = image_url[:100] + ' ...'
        print(f'{error}, image_url={image_url}')
        # Fall back to a dummy image so the pipeline does not crash
        img = Image.new('RGB', (32, 32))

    return img

model = '/mnt/petrelfs/huangsiyuan/VLA/InternVL/internvl_chat/output/internvla_8b_1node_with_visual_traces_wo_sp_token_w_cam/VLA8B_V1'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=2048),
    chat_template_config=ChatTemplateConfig(model_name='internvl2-internlm2'),
)

# Per-dimension bounds for the discretizers
TRANS_MIN = -0.275
TRANS_MAX = 0.275

ROT_MIN = -0.350
ROT_MAX = 0.395

OPEN_MIN = -0.388
OPEN_MAX = 0.300

translation_bins_num = 256
rotation_bins_num = 256
openness_bins_num = 256
translation_action_discretizer = TranslationActionDiscretizer(bins_num=translation_bins_num, min_action=TRANS_MIN, max_action=TRANS_MAX)
# NOTE: RotationActionDiscretizer accepts but ignores min_action/max_action;
# rotation bins are fixed over [0, 360) degrees
rotation_action_discretizer = RotationActionDiscretizer(bins_num=rotation_bins_num, min_action=ROT_MIN, max_action=ROT_MAX)
openness_action_discretizer = OpennessActionDiscretizer(bins_num=openness_bins_num, min_openness=OPEN_MIN, max_openness=OPEN_MAX)

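# --- Added sketch (not in the original script; the raw action values below are
# made up): the model's answer is a 7-integer list of 3 translation bins,
# 3 rotation bins, and 1 openness bin, matching the decoding at the bottom of
# this script.
_demo_xyz_bins = translation_action_discretizer.discretize(np.array([0.01, -0.02, 0.03]))
_demo_rpy_bins = rotation_action_discretizer.discretize(np.array([0.05, -0.05, 0.10]))  # radians (degrees=False)
_demo_open_bin = openness_action_discretizer.discretize(0.2)
print("[" + ",".join(map(str, [*_demo_xyz_bins, *_demo_rpy_bins, _demo_open_bin])) + "]")
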
VQA_FORMAT = f"{IMAGE_TOKEN}\n {IMAGE_TOKEN}\n Given the observation images from the wrist camera mounted at CAM_PARAM and the overhead camera mounted at CAM_PARAM, please provide the action that the robot should take to finish the task: TASK"
# question_template = "<image>\n <image>\n Given the observation images from the wrist camera mounted at <cam>[256,89,256,236,129,181]</cam> and the overhead camera mounted at <cam>[82,1,256,54,128,98]</cam>, please provide the action that the robot should take to finish the task: place a chess piece on the chessboard"

# Camera extrinsics: [x, y, z, roll, pitch, yaw] for each camera
wrist_cam_pose = [0.3618544138321802, -0.08323374464523976, 0.41759402329169787, 2.6584232953914344, 0.035482430406705845, 1.2906347836099603]
overhead_cam_pose = [-0.09877916942983442, -0.3919519409041736, 0.4780865865815033, -1.8237694898473762, -0.012183613523460979, -0.746683044221379]
cam_pose_list = [wrist_cam_pose, overhead_cam_pose]
for cam_pose in cam_pose_list:
    # Discretize the camera pose with the same bins as the actions and splice it
    # into the prompt as a <cam>[...]</cam> tag, one CAM_PARAM slot per camera.
    # Coordinates outside [TRANS_MIN, TRANS_MAX] saturate at the edge bins.
    cam_xyz_token = translation_action_discretizer.discretize(np.array(cam_pose[:3]))
    cam_rpy_token = rotation_action_discretizer.discretize(np.array(cam_pose[3:6]))
    cam_action_tokens = [cam_xyz_token[0], cam_xyz_token[1], cam_xyz_token[2], cam_rpy_token[0], cam_rpy_token[1], cam_rpy_token[2]]
    cam_action_tokens_str = "<cam>[" + ",".join(map(str, cam_action_tokens)) + "]</cam>"
    VQA_FORMAT = VQA_FORMAT.replace("CAM_PARAM", cam_action_tokens_str, 1)

# Task language instruction
task = "Pick up the green object from the table and put it in the bowl"
VQA_FORMAT = VQA_FORMAT.replace("TASK", task)

img1 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_0.png"
img2 = "/mnt/petrelfs/huangsiyuan/VLA/droid_action_tasks_internvl/sample_images/2_1.png"
images = [load_image(img1), load_image(img2)]  # only need to return the PIL.Image objects
response = pipe((VQA_FORMAT, images))
print(response.text)
print("gt: [124,137,104,126,130,129,233]")

# The model answers with a 7-integer list: 3 translation bins, 3 rotation bins,
# and 1 openness bin. Parse it and map each group back to continuous values.
action_list = np.array(ast.literal_eval(response.text))
xyz = translation_action_discretizer.undiscretize(action_list[:3])
rpy = rotation_action_discretizer.undiscretize(action_list[3:6])  # NOTE: returns a quaternion (x, y, z, w)
openness = openness_action_discretizer.undiscretize(action_list[6])

print(f"xyz: {xyz}, rpy: {rpy}, openness: {openness}")
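
# --- Added follow-up (not in the original script): the rotation undiscretize()
# returns a quaternion despite the rpy name; a controller that expects Euler
# angles would convert it back like this:
rpy_degrees = Rotation.from_quat(rpy).as_euler('xyz', degrees=True)
print(f"rpy (degrees): {rpy_degrees}")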