minimal example - code

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +38 -0
atari.py +126 -0
evaluate.ipynb +93 -0
networks.py +156 -0
performances.png +0 -0
requirements.txt +98 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*best_online_params filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# Model parameters trained with `i-DQN` and `i-IQN`
+This repository contains the model parameters trained with `i-DQN` on [$57$ Atari games](#list-of-games-for-i-dqn) and trained with `i-IQN` on [$20$ Atari games](#list-of-games-for-i-iqn) 🎮. $5$ seeds are available for each configuration which makes a total of $385$ available models 📈.
+The [evaluate.ipynb](./evaluate.ipynb) notebook contains a minimal example to evaluate the model parameters 🧑‍🏫. It uses JAX 🚀.
+ps: The set of [$20$ Atari games](#list-of-games-for-i-iqn) is included in the set of [$57$ Atari games](#list-of-games-for-i-dqn).
+### Model performances
+`i-DQN` and `i-IQN` are improvements made over [`DQN`](https://www.nature.com/articles/nature14236.pdf) and [`IQN`](https://arxiv.org/abs/1806.06923) ✨. Check it out on [arXiv](https://arxiv.org/abs/2403.02107)! | <img src="performances.png" alt="drawing" width="600"/>
+:-:|:-:
+### List of games for `i-DQN`
+Alien, Amidar, Assault, Asterix, Asteroids, Atlantis, BankHeist, BattleZone, BeamRider, Berzerk, Bowling, Boxing, Breakout, Centipede, ChopperCommand, CrazyClimber, DemonAttack, DoubleDunk, Enduro, FishingDerby, Freeway, Frostbite, Gopher, Gravitar, Hero, IceHockey, Jamesbond, Kangaroo, Krull, KungFuMaster, MontezumaRevenge, MsPacman, NameThisGame, Phoenix, Pitfall, Pong, Pooyan, PrivateEye, Qbert, Riverraid, RoadRunner, Robotank, Seaquest, Skiing, Solaris, SpaceInvaders, StarGunner, Tennis, TimePilot, Tutankham, UpNDown, Venture, VideoPinball, WizardOfWor, YarsRevenge, Zaxxon.
+### List of games for `i-IQN`
+Alien, Assault, BankHeist, Berzerk, Breakout, Centipede, ChopperCommand, DemonAttack, Enduro, Frostbite, Gopher, Gravitar, IceHockey, Jamesbond, Krull, KungFuMaster, Riverraid, Seaquest, Skiing, StarGunner.
+## User installation
+Python 3.10 is recommended. Create a Python virtual environment, activate it, update pip and install the dependencies:
+```bash
+python3.10 -m venv env
+source env/bin/activate
+pip install --upgrade pip
+pip install numpy==1.23.5  # to avoid numpy==2.XX
+pip install -r requirements.txt
+pip install --upgrade "jax[cuda12_pip]==0.4.13" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+```
+## Citing `i-QN`
+```
+@article{vincent2024iterated,
+  title={Iterated $ Q $-Network: Beyond the One-Step Bellman Operator},
+  author={Vincent, Th{\'e}o and Palenicek, Daniel and Belousov, Boris and Peters, Jan and D'Eramo, Carlo},
+  journal={arXiv preprint arXiv:2403.02107},
+  year={2024}
+}
+```

atari.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""
+The environment is inspired by https://github.com/google/dopamine/blob/master/dopamine/discrete_domains/atari_lib.py
+"""
+import os
+from typing import Tuple, Dict
+from gym.wrappers.monitoring import video_recorder
+import gym
+import numpy as np
+import jax
+import jax.numpy as jnp
+import cv2
+class AtariEnv:
+    def __init__(
+        self,
+        name: str,
+    ) -> None:
+        self.name = name
+        self.state_height, self.state_width = (84, 84)
+        self.n_stacked_frames = 4
+        self.n_skipped_frames = 4
+        self.env = gym.make(
+            f"ALE/{self.name}-v5",
+            full_action_space=False,
+            frameskip=1,
+            repeat_action_probability=0.25,
+            render_mode="rgb_array",
+        ).env
+        self.n_actions = self.env.action_space.n
+        self.original_state_height, self.original_state_width, _ = self.env.observation_space._shape
+        self.screen_buffer = [
+            np.empty((self.original_state_height, self.original_state_width), dtype=np.uint8),
+            np.empty((self.original_state_height, self.original_state_width), dtype=np.uint8),
+        ]
+    @property
+    def observation(self) -> np.ndarray:
+        return np.copy(self.state_[:, :, -1])
+    @property
+    def state(self) -> np.ndarray:
+        return jnp.array(self.state_, dtype=jnp.float32)
+    def reset(self) -> None:
+        self.env.reset()
+        self.n_steps = 0
+        self.env.ale.getScreenGrayscale(self.screen_buffer[0])
+        self.screen_buffer[1].fill(0)
+        self.state_ = np.zeros((self.state_height, self.state_width, self.n_stacked_frames), dtype=np.uint8)
+        self.state_[:, :, -1] = self.resize()
+    def step(self, action: jnp.int8) -> Tuple[float, bool, Dict]:
+        reward = 0
+        for idx_frame in range(self.n_skipped_frames):
+            _, reward_, terminal, _ = self.env.step(action)
+            reward += reward_
+            if idx_frame >= self.n_skipped_frames - 2:
+                t = idx_frame - (self.n_skipped_frames - 2)
+                self.env.ale.getScreenGrayscale(self.screen_buffer[t])
+            if terminal:
+                break
+        self.state_ = np.roll(self.state_, -1, axis=-1)
+        self.state_[:, :, -1] = self.pool_and_resize()
+        self.n_steps += 1
+        return reward, terminal, _
+    def pool_and_resize(self) -> np.ndarray:
+        np.maximum(self.screen_buffer[0], self.screen_buffer[1], out=self.screen_buffer[0])
+        return self.resize()
+    def resize(self):
+        return np.asarray(
+            cv2.resize(self.screen_buffer[0], (self.state_width, self.state_height), interpolation=cv2.INTER_AREA),
+            dtype=np.uint8,
+        )
+    def evaluate_one_simulation(
+        self,
+        q,
+        q_params: Dict,
+        horizon: int,
+        eps_eval: float,
+        exploration_key: jax.random.PRNGKey,
+        video_path: str,
+    ) -> float:
+        video = video_recorder.VideoRecorder(
+            self.env, path=f"{video_path}.mp4", enabled=True if video_path is not None else False
+        )
+        sun_reward = 0
+        terminal = False
+        self.reset()
+        while not terminal and self.n_steps < horizon:
+            self.env.render(mode="rgb_array")
+            video.capture_frame()
+            exploration_key, key = jax.random.split(exploration_key)
+            if jax.random.uniform(key) < eps_eval:
+                action = jax.random.choice(key, jnp.arange(self.n_actions)).astype(jnp.int8)
+            else:
+                action = q.best_action(q_params, self.state, key)
+            reward, terminal, _ = self.step(action)
+            sun_reward += reward
+        video.close()
+        if video_path is not None:
+            os.remove(f"{video_path}.meta.json")
+        return sun_reward, terminal

evaluate.ipynb ADDED Viewed

	@@ -0,0 +1,93 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import jax\n",
+    "import pickle\n",
+    "from atari import AtariEnv\n",
+    "from networks import AtariiDQN, AtariiIQN\n",
+    "from networks import AtariiIQN\n",
+    "\n",
+    "# ------- START TO MODIFY ------- #\n",
+    "IDQN_ALGO = True # if False then i-IQN is evaluated\n",
+    "GAME = \"Alien\"\n",
+    "NETWORK_SEED = 1 # seed in [1, 2, 3, 4, 5]\n",
+    "EVALUATION_SEED = 0\n",
+    "HORIZON = 27000\n",
+    "ENDING_EPS = 0.01\n",
+    "RECORD_VIDEO = False\n",
+    "\n",
+    "### 56 games are available for i-DQN with 5 seeds each:\n",
+    "# Alien, Amidar, Assault, Asterix, Asteroids, Atlantis, \n",
+    "# BankHeist, BattleZone, BeamRider, Berzerk, Bowling, Boxing, Breakout, Centipede, \n",
+    "# ChopperCommand, CrazyClimber, DemonAttack, DoubleDunk, Enduro, FishingDerby, \n",
+    "# Freeway, Frostbite, Gopher, Gravitar, Hero, IceHockey, Jamesbond, Kangaroo, \n",
+    "# Krull, KungFuMaster, MontezumaRevenge, MsPacman, NameThisGame, Phoenix, Pitfall, \n",
+    "# Pong, Pooyan, PrivateEye, Qbert, Riverraid, RoadRunner, Robotank, Seaquest, Skiing, \n",
+    "# Solaris, SpaceInvaders, StarGunner, Tennis, TimePilot, Tutankham, UpNDown, Venture, \n",
+    "# VideoPinball, WizardOfWor, YarsRevenge, Zaxxon\n",
+    "\n",
+    "## 20 games are available for i-IQN with 5 seeds each:\n",
+    "# Alien, Assault, BankHeist, Berzerk, Breakout, Centipede, \n",
+    "# ChopperCommand, DemonAttack, Enduro, Frostbite, Gopher, \n",
+    "# Gravitar, IceHockey, Jamesbond, Krull, KungFuMaster, \n",
+    "# Riverraid, Seaquest, Skiing, StarGunner\n",
+    "# ------- END TO MODIFY ------- #\n",
+    "\n",
+    "\n",
+    "params_path = f\"parameters/{GAME}/{'iDQN' if IDQN_ALGO else 'iIQN'}/{5 if IDQN_ALGO else 3}_Q_{NETWORK_SEED}_best_online_params\"\n",
+    "\n",
+    "env = AtariEnv(GAME)\n",
+    "\n",
+    "if IDQN_ALGO:\n",
+    "    q = AtariiDQN(env.n_actions, idx_head=0) # idx_head in [0, 1, 2, 3, 4, 5]\n",
+    "else:\n",
+    "    q = AtariiIQN(env.n_actions, idx_head=0) # idx_head in [0, 1, 2, 3]\n",
+    "\n",
+    "with open(params_path, \"rb\") as handle:\n",
+    "    q_params = pickle.load(handle)\n",
+    "\n",
+    "reward, absorbing = env.evaluate_one_simulation(\n",
+    "    q, q_params, HORIZON, ENDING_EPS, jax.random.PRNGKey(EVALUATION_SEED), params_path if RECORD_VIDEO else None\n",
+    ")\n",
+    "print(\"Undiscounted reward:\", reward)\n",
+    "print(\"N steps\", env.n_steps, \"; Horizon\", HORIZON, \"; Absorbing\", absorbing)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

networks.py ADDED Viewed

	@@ -0,0 +1,156 @@

+from flax.core import FrozenDict
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from functools import partial
+# --- Base functions ---
+def scale(state: jnp.ndarray) -> jnp.ndarray:
+    return state / 255.0
+class Torso(nn.Module):
+    initialization_type: str
+    @nn.compact
+    def __call__(self, state):
+        if self.initialization_type == "dqn":
+            initializer = nn.initializers.variance_scaling(scale=1.0, mode="fan_avg", distribution="truncated_normal")
+        elif self.initialization_type == "iqn":
+            initializer = nn.initializers.variance_scaling(
+                scale=1.0 / jnp.sqrt(3.0), mode="fan_in", distribution="uniform"
+            )
+        x = nn.Conv(features=32, kernel_size=(8, 8), strides=(4, 4), kernel_init=initializer)(state)
+        x = nn.relu(x)
+        x = nn.Conv(features=64, kernel_size=(4, 4), strides=(2, 2), kernel_init=initializer)(x)
+        x = nn.relu(x)
+        x = nn.Conv(features=64, kernel_size=(3, 3), strides=(1, 1), kernel_init=initializer)(x)
+        x = nn.relu(x)
+        return x.flatten()
+class Head(nn.Module):
+    n_actions: int
+    initialization_type: str
+    @nn.compact
+    def __call__(self, x):
+        if self.initialization_type == "dqn":
+            initializer = nn.initializers.variance_scaling(scale=1.0, mode="fan_avg", distribution="truncated_normal")
+        elif self.initialization_type == "iqn":
+            initializer = nn.initializers.variance_scaling(
+                scale=1.0 / jnp.sqrt(3.0), mode="fan_in", distribution="uniform"
+            )
+        x = nn.Dense(features=512, kernel_init=initializer)(x)
+        x = nn.relu(x)
+        return nn.Dense(features=self.n_actions, kernel_init=initializer)(x)
+class QuantileEmbedding(nn.Module):
+    n_features: int = 7744
+    quantile_embedding_dim: int = 64
+    @nn.compact
+    def __call__(self, key, n_quantiles):
+        initializer = nn.initializers.variance_scaling(scale=1.0 / jnp.sqrt(3.0), mode="fan_in", distribution="uniform")
+        quantiles = jax.random.uniform(key, shape=(n_quantiles, 1))
+        arange = jnp.arange(1, self.quantile_embedding_dim + 1).reshape((1, self.quantile_embedding_dim))
+        quantile_embedding = nn.Dense(features=self.n_features, kernel_init=initializer)(
+            jnp.cos(jnp.pi * quantiles @ arange)
+        )
+        # output (n_quantiles, n_features) | (n_quantiles)
+        return (nn.relu(quantile_embedding), jnp.squeeze(quantiles, axis=1))
+# --- i-DQN networks ---
+class AtariSharediDQNNet:
+    def __init__(self, n_actions: int) -> None:
+        self.n_heads = 5
+        self.n_actions = n_actions
+        self.torso = Torso("dqn")
+        self.head = Head(self.n_actions, "dqn")
+    def apply(self, params: FrozenDict, idx_head: int, state: jnp.ndarray) -> jnp.ndarray:
+        feature = self.torso.apply(
+            params[f"torso_params_{min(idx_head, 1)}"],
+            state,
+        )
+        return self.head.apply(params[f"head_params_{idx_head}"], feature)
+class AtariiDQN:
+    def __init__(self, n_actions: int, idx_head: int) -> None:
+        self.network = AtariSharediDQNNet(n_actions)
+        self.idx_head = idx_head
+    @partial(jax.jit, static_argnames="self")
+    def best_action(self, params: FrozenDict, state: jnp.ndarray, key: jax.random.PRNGKeyArray) -> jnp.int8:
+        return jnp.argmax(self.network.apply(params, self.idx_head, scale(state))).astype(jnp.int8)
+# --- i-IQN networks ---
+class AtariSharediIQNNet:
+    def __init__(self, n_actions: int) -> None:
+        self.n_heads = 4
+        self.n_actions = n_actions
+        self.torso = Torso("iqn")
+        self.quantile_embedding = QuantileEmbedding()
+        self.head = Head(self.n_actions, "iqn")
+    def apply(
+        self, params: FrozenDict, idx_head: int, state: jnp.ndarray, key: jax.random.PRNGKey, n_quantiles: int
+    ) -> jnp.ndarray:
+        # output (n_features)
+        state_feature = self.torso.apply(
+            jax.tree_util.tree_map(
+                lambda param: param[jax.lax.cond(idx_head >= 1, lambda: 1, lambda: 0)], params["torso_params"]
+            ),
+            state,
+        )
+        # output (n_quantiles, n_features)
+        quantiles_feature, _ = self.quantile_embedding.apply(
+            jax.tree_util.tree_map(
+                lambda param: param[jax.lax.cond(idx_head >= 1, lambda: 1, lambda: 0)], params["quantiles_params"]
+            ),
+            key,
+            n_quantiles,
+        )
+        # mapping over the quantiles | output (n_quantiles, n_features)
+        feature = jax.vmap(
+            lambda quantile_feature_, state_feature_: quantile_feature_ * state_feature_, in_axes=(0, None)
+        )(quantiles_feature, state_feature)
+        return self.head.apply(
+            jax.tree_util.tree_map(lambda param: param[idx_head], params["head_params"]), feature
+        )  # output (n_quantiles, n_actions)
+class AtariiIQN:
+    def __init__(self, n_actions: int, idx_head: int) -> None:
+        self.network = AtariSharediIQNNet(n_actions)
+        self.idx_head = idx_head
+        self.n_quantiles_policy = 32
+    @partial(jax.jit, static_argnames="self")
+    def best_action(self, params: FrozenDict, state: jnp.ndarray, key: jax.random.PRNGKeyArray) -> jnp.int8:
+        # output (n_quantiles, n_actions)
+        q_quantiles = self.network.apply(params, self.idx_head, scale(state), key, self.n_quantiles_policy)
+        q_values = jnp.mean(q_quantiles, axis=0)
+        return jnp.argmax(q_values).astype(jnp.int8)

performances.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,98 @@

+absl-py==1.4.0
+ale-py==0.7.5
+arch==6.2.0
+asttokens==2.2.1
+AutoROM==0.4.2
+AutoROM.accept-rom-license==0.6.1
+backcall==0.2.0
+cached-property==1.5.2
+certifi==2023.5.7
+charset-normalizer==3.1.0
+chex==0.1.8
+click==8.1.3
+cloudpickle==2.2.1
+comm==0.1.3
+contourpy==1.1.0
+cycler==0.11.0
+debugpy==1.6.7
+decorator==5.1.1
+dm-tree==0.1.8
+etils==1.3.0
+exceptiongroup==1.1.3
+executing==1.2.0
+flax==0.6.11
+fonttools==4.40.0
+fsspec==2023.9.2
+gym==0.25.2
+gym-notices==0.0.8
+idna==3.4
+importlib-resources==5.12.0
+iniconfig==2.0.0
+ipykernel==6.25.0
+ipython==8.14.0
+jax==0.4.13
+jaxlib==0.4.13
+jedi==0.18.2
+jupyter_client==8.3.0
+jupyter_core==5.3.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+ml-dtypes==0.2.0
+msgpack==1.0.5
+nest-asyncio==1.5.6
+numpy==1.23.5
+nvidia-cublas-cu12==12.2.5.6
+nvidia-cuda-cupti-cu12==12.2.142
+nvidia-cuda-nvcc-cu12==12.2.140
+nvidia-cuda-nvrtc-cu12==12.2.140
+nvidia-cuda-runtime-cu12==12.2.140
+nvidia-cudnn-cu12==8.9.4.25
+nvidia-cufft-cu12==11.0.8.103
+nvidia-cusolver-cu12==11.5.2.141
+nvidia-cusparse-cu12==12.1.2.141
+nvidia-nvjitlink-cu12==12.2.140
+opencv-python==4.7.0.72
+opt-einsum==3.3.0
+optax==0.1.5
+orbax-checkpoint==0.2.6
+packaging==23.1
+pandas==2.0.2
+parso==0.8.3
+patsy==0.5.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.5.0
+platformdirs==3.9.1
+pluggy==1.3.0
+prompt-toolkit==3.0.38
+protobuf==4.23.3
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+Pygments==2.15.1
+pyparsing==3.1.0
+pytest==7.4.0
+python-dateutil==2.8.2
+pytz==2023.3
+PyYAML==6.0
+pyzmq==25.1.0
+requests==2.31.0
+rich==13.4.2
+scipy==1.11.0
+six==1.16.0
+stack-data==0.6.2
+statsmodels==0.14.0
+tensorstore==0.1.39
+tomli==2.0.1
+toolz==0.12.0
+tornado==6.3.2
+tqdm==4.65.0
+traitlets==5.9.0
+typing_extensions==4.6.3
+tzdata==2023.3
+urllib3==1.26.16
+wcwidth==0.2.6
+zipp==3.17.0