koukyo1994 committed
Commit 2441869
Parent(s): a81bee7

add inference.py

inference.py ADDED (+185 -0)
@@ -0,0 +1,185 @@
import argparse
import json
import random
from pathlib import Path

import imageio
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel
from tqdm import tqdm


# Constants
IMAGE_SIZE = (288, 512)
N_FRAMES_PER_ROUND = 25
MAX_NUM_FRAMES = 50
N_TOKENS_PER_FRAME = 576
TRAJ_TEMPLATE_PATH = Path("./assets/template_trajectory.json")
PATH_START_ID = 9
PATH_POINT_INTERVAL = 10
N_ACTION_TOKENS = 6

# change here if you want to use your own images
CONDITIONING_FRAMES_DIR = Path("./assets/conditioning_frames")
CONDITIONING_FRAMES_PATH_LIST = [
    CONDITIONING_FRAMES_DIR / "001.png",
    CONDITIONING_FRAMES_DIR / "002.png",
    CONDITIONING_FRAMES_DIR / "003.png"
]


def set_random_seed(seed: int = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def preprocess_image(image: Image.Image, size: tuple[int, int] = (288, 512)) -> torch.Tensor:
    H, W = size
    image = image.convert("RGB")
    image = image.resize((W, H))
    image_array = np.array(image)
    image_array = (image_array / 127.5 - 1.0).astype(np.float32)
    return torch.from_numpy(image_array).permute(2, 0, 1).unsqueeze(0).float()


def to_np_images(images: torch.Tensor) -> np.ndarray:
    images = images.detach().cpu()
    images = torch.clamp(images, -1., 1.)
    images = (images + 1.) / 2.
    images = images.permute(0, 2, 3, 1).numpy()
    return (255 * images).astype(np.uint8)


def load_images(file_path_list: list[Path], size: tuple[int, int] = (288, 512)) -> torch.Tensor:
    images = []
    for file_path in file_path_list:
        image = Image.open(file_path)
        image = preprocess_image(image, size)
        images.append(image)
    return torch.cat(images, dim=0)


def save_images_to_mp4(images: np.ndarray, output_path: Path, fps: int = 10):
    writer = imageio.get_writer(output_path, fps=fps)
    for img in images:
        writer.append_data(img)
    writer.close()


def determine_num_rounds(num_frames: int, num_overlapping_frames: int, n_initial_frames: int) -> int:
    n_rounds = (num_frames - n_initial_frames) // (N_FRAMES_PER_ROUND - num_overlapping_frames)
    if (num_frames - n_initial_frames) % (N_FRAMES_PER_ROUND - num_overlapping_frames) > 0:
        n_rounds += 1
    return n_rounds


def prepare_action(
    traj_template: dict,
    cmd: str,
    path_start_id: int,
    path_point_interval: int,
    n_action_tokens: int = 5,
    start_index: int = 0,
    n_frames: int = 25
) -> torch.Tensor:
    trajs = traj_template[cmd]["instruction_trajs"]
    actions = []
    timesteps = np.arange(0.0, 3.0, 0.05)
    for i in range(start_index, start_index + n_frames):
        traj = trajs[i][path_start_id::path_point_interval][:n_action_tokens]
        action = np.array(traj)
        timestep = timesteps[path_start_id::path_point_interval][:n_action_tokens]
        action = np.concatenate([
            action[:, [1, 0]],
            timestep.reshape(-1, 1)
        ], axis=1)
        actions.append(torch.tensor(action))
    return torch.cat(actions, dim=0)


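# Note on prepare_action: with the constants above (PATH_START_ID=9,
# PATH_POINT_INTERVAL=10, N_ACTION_TOKENS=6), each frame contributes the
# template waypoints at indices 9, 19, 29, 39, 49, 59 (assuming the template
# trajectories have at least 60 points), i.e. timesteps 0.45, 0.95, 1.45,
# 1.95, 2.45, 2.95 s from np.arange(0.0, 3.0, 0.05). The first two trajectory
# columns are swapped and the timestep appended as a third column, so the
# returned tensor has shape (n_frames * N_ACTION_TOKENS, 3).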
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--output_dir", type=Path)
    parser.add_argument("--cmd", type=str, default="curving_to_left/curving_to_left_moderate")
    parser.add_argument("--num_frames", type=int, default=25)
    parser.add_argument("--num_overlapping_frames", type=int, default=3)
    args = parser.parse_args()

    assert args.num_frames <= MAX_NUM_FRAMES, f"`num_frames` should be less than or equal to {MAX_NUM_FRAMES}"
    assert args.num_overlapping_frames < N_FRAMES_PER_ROUND, f"`num_overlapping_frames` should be less than {N_FRAMES_PER_ROUND}"

    set_random_seed(args.seed)
    if args.output_dir is None:
        output_dir = Path(f"./outputs/{args.cmd}")
    else:
        output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoModel.from_pretrained("turing-motors/Terra", subfolder="lfq_tokenizer_B_256", trust_remote_code=True).to(device).eval()
    model = AutoModel.from_pretrained("turing-motors/Terra", subfolder="world_model", trust_remote_code=True).to(device).eval()

    conditioning_frames = load_images(CONDITIONING_FRAMES_PATH_LIST, IMAGE_SIZE).to(device)
    with torch.inference_mode(), torch.autocast(device_type="cuda"):
        input_ids = tokenizer.tokenize(conditioning_frames).detach().unsqueeze(0)

    num_rounds = determine_num_rounds(args.num_frames, args.num_overlapping_frames, len(CONDITIONING_FRAMES_PATH_LIST))
    print(f"Number of generation rounds: {num_rounds}")

    with open(TRAJ_TEMPLATE_PATH) as f:
        traj_template = json.load(f)

    all_outputs = []
    for round in range(num_rounds):
        start_index = round * (N_FRAMES_PER_ROUND - args.num_overlapping_frames)
        num_frames_for_round = min(N_FRAMES_PER_ROUND, args.num_frames - start_index)
        actions = prepare_action(
            traj_template, args.cmd, PATH_START_ID, PATH_POINT_INTERVAL, N_ACTION_TOKENS, start_index, num_frames_for_round
        ).unsqueeze(0).to(device).float()
        if round == 0:
            num_generated_tokens = N_TOKENS_PER_FRAME * (num_frames_for_round - len(CONDITIONING_FRAMES_PATH_LIST))
        else:
            num_generated_tokens = N_TOKENS_PER_FRAME * (num_frames_for_round - args.num_overlapping_frames)
        progress_bar = tqdm(total=num_generated_tokens, desc=f"Round {round + 1}")
        with torch.inference_mode(), torch.autocast(device_type="cuda"):
            output_tokens = model.generate(
                input_ids=input_ids,
                actions=actions,
                do_sample=True,
                max_length=N_TOKENS_PER_FRAME * num_frames_for_round,
                temperature=1.0,
                top_p=1.0,
                use_cache=True,
                pad_token_id=None,
                eos_token_id=None,
                progress_bar=progress_bar
            )
        if round == 0:
            all_outputs.append(output_tokens[0])
        else:
            all_outputs.append(output_tokens[0, args.num_overlapping_frames * N_TOKENS_PER_FRAME:])
        input_ids = output_tokens[:, -args.num_overlapping_frames * N_TOKENS_PER_FRAME:]
        progress_bar.close()

    output_ids = torch.cat(all_outputs)

    # Calculate the shape of the latent tensor
    downsample_ratio = 1
    for coef in tokenizer.config.encoder_decoder_config["ch_mult"]:
        downsample_ratio *= coef
    h = IMAGE_SIZE[0] // downsample_ratio
    w = IMAGE_SIZE[1] // downsample_ratio
    c = tokenizer.config.encoder_decoder_config["z_channels"]
    latent_shape = (len(output_ids) // 576, h, w, c)

    # Decode the latent tensor to images
    with torch.inference_mode(), torch.autocast(device_type="cuda"):
        reconstructed = tokenizer.decode_tokens(output_ids, latent_shape)
    reconstructed_images = to_np_images(reconstructed)
    save_images_to_mp4(reconstructed_images, output_dir / "generated.mp4", fps=10)
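For reference, a minimal sketch of the sliding-window bookkeeping the script performs, assuming an illustrative run such as `python inference.py --num_frames 50` (num_frames defaults to 25; the values below are example inputs, not outputs of an actual run):

# Illustrative sketch only: mirrors the round/token arithmetic in inference.py
# for an assumed run with --num_frames 50 and the default --num_overlapping_frames 3.
N_FRAMES_PER_ROUND = 25
N_TOKENS_PER_FRAME = 576
n_initial_frames = 3          # len(CONDITIONING_FRAMES_PATH_LIST)
num_frames = 50               # assumed --num_frames
num_overlapping_frames = 3    # default --num_overlapping_frames

step = N_FRAMES_PER_ROUND - num_overlapping_frames  # 22 new frames per follow-up round
num_rounds = (num_frames - n_initial_frames) // step
if (num_frames - n_initial_frames) % step > 0:
    num_rounds += 1                                  # ceil(47 / 22) = 3 rounds

for rnd in range(num_rounds):
    start_index = rnd * step
    frames_this_round = min(N_FRAMES_PER_ROUND, num_frames - start_index)
    known_frames = n_initial_frames if rnd == 0 else num_overlapping_frames
    generated = N_TOKENS_PER_FRAME * (frames_this_round - known_frames)
    print(f"round {rnd}: frames {start_index}..{start_index + frames_this_round - 1}, "
          f"{generated} tokens sampled")
# round 0: frames 0..24, 12672 tokens sampled
# round 1: frames 22..46, 12672 tokens sampled
# round 2: frames 44..49, 1728 tokens sampled

After each round, only the last num_overlapping_frames * N_TOKENS_PER_FRAME tokens are fed back as the prompt for the next round, so the final 50-frame video is stitched together from overlapping 25-frame windows before being decoded and written to generated.mp4.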