sdpkjc committed on
Commit
e1a8c96
1 Parent(s): ae99e83

pushing model

Browse files
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - InvertedPendulum-v2
4
+ - deep-reinforcement-learning
5
+ - reinforcement-learning
6
+ - custom-implementation
7
+ library_name: cleanrl
8
+ model-index:
9
+ - name: DDPG
10
+ results:
11
+ - task:
12
+ type: reinforcement-learning
13
+ name: reinforcement-learning
14
+ dataset:
15
+ name: InvertedPendulum-v2
16
+ type: InvertedPendulum-v2
17
+ metrics:
18
+ - type: mean_reward
19
+ value: 709.70 +/- 443.44
20
+ name: mean_reward
21
+ verified: false
22
+ ---
23
+
24
+ # (CleanRL) **DDPG** Agent Playing **InvertedPendulum-v2**
25
+
26
+ This is a trained model of a DDPG agent playing InvertedPendulum-v2.
27
+ The model was trained by using [CleanRL](https://github.com/vwxyzjn/cleanrl) and the most up-to-date training code can be
28
+ found [here](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ddpg_continuous_action_jax.py).
29
+
30
+ ## Get Started
31
+
32
+ To use this model, please install the `cleanrl` package with the following command:
33
+
34
+ ```
35
+ pip install "cleanrl[ddpg_continuous_action_jax]"
36
+ python -m cleanrl_utils.enjoy --exp-name ddpg_continuous_action_jax --env-id InvertedPendulum-v2
37
+ ```
38
+
39
+ Please refer to the [documentation](https://docs.cleanrl.dev/get-started/zoo/) for more detail.
40
+
41
+
42
+ ## Command to reproduce the training
43
+
44
+ ```bash
45
+ curl -OL https://huggingface.co/cleanrl/InvertedPendulum-v2-ddpg_continuous_action_jax-seed1/raw/main/ddpg_continuous_action_jax.py
46
+ curl -OL https://huggingface.co/cleanrl/InvertedPendulum-v2-ddpg_continuous_action_jax-seed1/raw/main/pyproject.toml
47
+ curl -OL https://huggingface.co/cleanrl/InvertedPendulum-v2-ddpg_continuous_action_jax-seed1/raw/main/poetry.lock
48
+ poetry install --all-extras
49
+ python ddpg_continuous_action_jax.py --track --capture-video --save-model --hf-entity cleanrl --upload-model --env-id InvertedPendulum-v2 --seed 1
50
+ ```
51
+
52
+ ## Hyperparameters
53
+ ```python
54
+ {'batch_size': 256,
55
+ 'buffer_size': 1000000,
56
+ 'capture_video': True,
57
+ 'env_id': 'InvertedPendulum-v2',
58
+ 'exp_name': 'ddpg_continuous_action_jax',
59
+ 'exploration_noise': 0.1,
60
+ 'gamma': 0.99,
61
+ 'hf_entity': 'cleanrl',
62
+ 'learning_rate': 0.0003,
63
+ 'learning_starts': 25000.0,
64
+ 'noise_clip': 0.5,
65
+ 'policy_frequency': 2,
66
+ 'save_model': True,
67
+ 'seed': 1,
68
+ 'tau': 0.005,
69
+ 'total_timesteps': 1000000,
70
+ 'track': True,
71
+ 'upload_model': True,
72
+ 'wandb_entity': None,
73
+ 'wandb_project_name': 'cleanRL'}
74
+ ```
75
+
ddpg_continuous_action_jax.cleanrl_model ADDED
Binary file (540 kB). View file
 
ddpg_continuous_action_jax.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ddpg/#ddpg_continuous_action_jaxpy
2
+ import argparse
3
+ import os
4
+ import random
5
+ import time
6
+ from distutils.util import strtobool
7
+ from typing import Sequence
8
+
9
+ import flax
10
+ import flax.linen as nn
11
+ import gymnasium as gym
12
+ import jax
13
+ import jax.numpy as jnp
14
+ import numpy as np
15
+ import optax
16
+ from flax.training.train_state import TrainState
17
+ from stable_baselines3.common.buffers import ReplayBuffer
18
+ from torch.utils.tensorboard import SummaryWriter
19
+
20
+
21
def _str_to_bool(value):
    """Convert a command-line truth string to ``bool``.

    Accepts the same spellings as ``distutils.util.strtobool`` (which was
    removed from the standard library in Python 3.12), case-insensitively:
    ``y/yes/t/true/on/1`` and ``n/no/f/false/off/0``.

    Raises:
        ValueError: if ``value`` is not one of the recognised spellings;
            argparse turns this into a regular usage error.
    """
    normalized = value.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {normalized!r}")


def parse_args():
    """Parse the command-line options of the DDPG training script.

    Boolean switches (``--track``, ``--capture-video``, ``--save-model``,
    ``--upload-model``) may be given bare (meaning ``True``) or followed by an
    explicit truth string such as ``false``.

    Returns:
        argparse.Namespace: the parsed experiment and algorithm settings.
    """
    # Default experiment name: the script's file name without its ".py" suffix.
    # ``str.rstrip(".py")`` would strip any trailing '.', 'p' or 'y' characters
    # (e.g. "happy.py" -> "ha"), so the suffix is removed explicitly instead.
    script_name = os.path.basename(__file__)
    if script_name.endswith(".py"):
        script_name = script_name[: -len(".py")]

    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, default=script_name,
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    parser.add_argument("--track", type=_str_to_bool, default=False, nargs="?", const=True,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--capture-video", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to capture videos of the agent performances (check out `videos` folder)")
    parser.add_argument("--save-model", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to save model into the `runs/{run_name}` folder")
    parser.add_argument("--upload-model", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to upload the saved model to huggingface")
    parser.add_argument("--hf-entity", type=str, default="",
        help="the user or org name of the model repository from the Hugging Face Hub")

    # Algorithm specific arguments
    parser.add_argument("--env-id", type=str, default="HalfCheetah-v4",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=1000000,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=3e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--buffer-size", type=int, default=int(1e6),
        help="the replay memory buffer size")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--tau", type=float, default=0.005,
        help="target smoothing coefficient (default: 0.005)")
    parser.add_argument("--batch-size", type=int, default=256,
        help="the batch size of sample from the reply memory")
    parser.add_argument("--exploration-noise", type=float, default=0.1,
        help="the scale of exploration noise")
    # argparse does not run non-string defaults through ``type``, so the
    # default must already be an int (25e3 alone is the float 25000.0).
    parser.add_argument("--learning-starts", type=int, default=int(25e3),
        help="timestep to start learning")
    parser.add_argument("--policy-frequency", type=int, default=2,
        help="the frequency of training policy (delayed)")
    parser.add_argument("--noise-clip", type=float, default=0.5,
        help="noise clip parameter of the Target Policy Smoothing Regularization")
    args = parser.parse_args()
    # fmt: on
    return args
69
+
70
+
71
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    The environment is created lazily, when the returned callable is invoked.

    Args:
        env_id: id passed to ``gym.make``.
        seed: seed applied to the action and observation spaces.
        idx: index of this environment; only index 0 records videos.
        capture_video: when true, create the env with ``rgb_array`` rendering
            and (for index 0) record videos.
        run_name: name of the sub-folder of ``videos/`` that receives recordings.

    Returns:
        A callable taking no arguments and returning the wrapped environment.
    """

    def thunk():
        # RGB-array rendering is only requested when videos are captured.
        make_kwargs = {"render_mode": "rgb_array"} if capture_video else {}
        env = gym.wrappers.RecordEpisodeStatistics(gym.make(env_id, **make_kwargs))
        if capture_video and idx == 0:
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        # Seed the spaces so that sampled actions/observations are repeatable.
        for space in (env.action_space, env.observation_space):
            space.seed(seed)
        return env

    return thunk
86
+
87
+
88
+ # ALGO LOGIC: initialize agent here:
89
class QNetwork(nn.Module):
    """Critic network mapping an (observation, action) pair to one Q-value.

    The inputs are concatenated along the last axis and passed through two
    256-unit ReLU layers followed by a single-unit linear output layer.
    """

    @nn.compact
    def __call__(self, x: jnp.ndarray, a: jnp.ndarray):
        hidden = jnp.concatenate([x, a], -1)
        # Layers are created in the same order as before, so the parameter
        # tree produced by Flax's automatic naming is unchanged.
        for width in (256, 256):
            hidden = nn.relu(nn.Dense(width)(hidden))
        return nn.Dense(1)(hidden)
99
+
100
+
101
class Actor(nn.Module):
    """Deterministic policy network mapping observations to actions.

    Two 256-unit ReLU layers feed an ``action_dim``-wide output layer whose
    ``tanh`` activation is rescaled by ``action_scale`` and shifted by
    ``action_bias``.
    """

    # Width of the output layer (number of action components). The script
    # passes ``np.prod(action_space.shape)``, i.e. a scalar, not a sequence.
    action_dim: int
    # Multiplier and offset applied to the tanh output; the script passes
    # float arrays computed from the action-space bounds.
    action_scale: np.ndarray
    action_bias: np.ndarray

    @nn.compact
    def __call__(self, x):
        x = nn.Dense(256)(x)
        x = nn.relu(x)
        x = nn.Dense(256)(x)
        x = nn.relu(x)
        x = nn.Dense(self.action_dim)(x)
        # tanh bounds the raw output to [-1, 1] before rescaling.
        x = nn.tanh(x)
        x = x * self.action_scale + self.action_bias
        return x
116
+
117
+
118
class TrainState(TrainState):
    """Flax ``TrainState`` extended with a second set of parameters.

    The class deliberately reuses (and therefore shadows) the imported
    ``TrainState`` name, so every later reference in this file resolves to
    this subclass.
    """

    # Parameters of the target network, stored next to the trained ``params``.
    target_params: flax.core.FrozenDict
120
+
121
+
122
if __name__ == "__main__":
    # Script entry point: parse the options, build the environment and the
    # networks, run the DDPG training loop, then optionally save, evaluate and
    # upload the trained model.
    import stable_baselines3 as sb3

    # Compare the major version numerically: a plain string comparison is
    # lexicographic and would, for example, rank "10.0" below "2.0".
    if int(sb3.__version__.split(".")[0]) < 2:
        raise ValueError(
            """Ongoing migration: run the following command to install the new dependencies:

poetry run pip install "stable_baselines3==2.0.0a1"
"""
        )
    args = parse_args()
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    key = jax.random.PRNGKey(args.seed)
    key, actor_key, qf1_key = jax.random.split(key, 3)

    # env setup
    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

    max_action = float(envs.single_action_space.high[0])
    envs.single_observation_space.dtype = np.float32
    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device="cpu",
        handle_timeout_termination=False,
    )

    # TRY NOT TO MODIFY: start the game
    # Seed the environments themselves; make_env only seeds the spaces, so an
    # unseeded reset would leave the dynamics non-reproducible for a fixed --seed.
    obs, _ = envs.reset(seed=args.seed)

    # Half-range and midpoint of the (batched) action bounds, used to map the
    # actor's tanh output onto the environment's action range.
    action_scale = np.array((envs.action_space.high - envs.action_space.low) / 2.0)
    action_bias = np.array((envs.action_space.high + envs.action_space.low) / 2.0)
    # Per-environment action bounds used to clip the target actions.
    action_low = envs.single_action_space.low
    action_high = envs.single_action_space.high
    actor = Actor(
        action_dim=np.prod(envs.single_action_space.shape),
        action_scale=action_scale,
        action_bias=action_bias,
    )
    qf1 = QNetwork()
    # Target parameters are initialised with the same key as the online
    # parameters, so both networks start out identical.
    actor_state = TrainState.create(
        apply_fn=actor.apply,
        params=actor.init(actor_key, obs),
        target_params=actor.init(actor_key, obs),
        tx=optax.adam(learning_rate=args.learning_rate),
    )
    qf1_state = TrainState.create(
        apply_fn=qf1.apply,
        params=qf1.init(qf1_key, obs, envs.action_space.sample()),
        target_params=qf1.init(qf1_key, obs, envs.action_space.sample()),
        tx=optax.adam(learning_rate=args.learning_rate),
    )
    actor.apply = jax.jit(actor.apply)
    qf1.apply = jax.jit(qf1.apply)

    @jax.jit
    def update_critic(
        actor_state: TrainState,
        qf1_state: TrainState,
        observations: np.ndarray,
        actions: np.ndarray,
        next_observations: np.ndarray,
        rewards: np.ndarray,
        dones: np.ndarray,
    ):
        """Take one gradient step on the critic's mean-squared TD error.

        Returns the updated critic state, the loss value and the mean
        predicted Q-value of the batch.
        """
        # Clip the target actions to the environment's real action bounds
        # rather than a hard-coded [-1, 1], which is wrong for environments
        # whose action range differs (e.g. +/-3).
        next_state_actions = (actor.apply(actor_state.target_params, next_observations)).clip(action_low, action_high)
        qf1_next_target = qf1.apply(qf1_state.target_params, next_observations, next_state_actions).reshape(-1)
        next_q_value = (rewards + (1 - dones) * args.gamma * (qf1_next_target)).reshape(-1)

        def mse_loss(params):
            qf1_a_values = qf1.apply(params, observations, actions).squeeze()
            return ((qf1_a_values - next_q_value) ** 2).mean(), qf1_a_values.mean()

        (qf1_loss_value, qf1_a_values), grads = jax.value_and_grad(mse_loss, has_aux=True)(qf1_state.params)
        qf1_state = qf1_state.apply_gradients(grads=grads)
        return qf1_state, qf1_loss_value, qf1_a_values

    @jax.jit
    def update_actor(
        actor_state: TrainState,
        qf1_state: TrainState,
        observations: np.ndarray,
    ):
        """Take one policy-gradient step and soft-update both target networks.

        Returns the updated actor state, the critic state with refreshed
        target parameters, and the actor loss value.
        """

        def actor_loss(params):
            # Maximise the critic's value of the actor's actions.
            return -qf1.apply(qf1_state.params, observations, actor.apply(params, observations)).mean()

        actor_loss_value, grads = jax.value_and_grad(actor_loss)(actor_state.params)
        actor_state = actor_state.apply_gradients(grads=grads)
        # Move the target parameters towards the online parameters with step size tau.
        actor_state = actor_state.replace(
            target_params=optax.incremental_update(actor_state.params, actor_state.target_params, args.tau)
        )
        qf1_state = qf1_state.replace(
            target_params=optax.incremental_update(qf1_state.params, qf1_state.target_params, args.tau)
        )
        return actor_state, qf1_state, actor_loss_value

    start_time = time.time()
    for global_step in range(args.total_timesteps):
        # ALGO LOGIC: put action logic here
        if global_step < args.learning_starts:
            # Warm-up phase: act uniformly at random to fill the replay buffer.
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            # Policy action plus Gaussian exploration noise, clipped to the action bounds.
            actions = actor.apply(actor_state.params, obs)
            actions = np.array(
                [
                    (jax.device_get(actions)[0] + np.random.normal(0, action_scale * args.exploration_noise)[0]).clip(
                        envs.single_action_space.low, envs.single_action_space.high
                    )
                ]
            )

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards, terminateds, truncateds, infos = envs.step(actions)

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        if "final_info" in infos:
            for info in infos["final_info"]:
                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                break

        # TRY NOT TO MODIFY: save data to replay buffer; handle `terminal_observation`
        # For truncated episodes the stored next observation is the episode's
        # final observation rather than the one returned after the auto-reset.
        real_next_obs = next_obs.copy()
        for idx, d in enumerate(truncateds):
            if d:
                real_next_obs[idx] = infos["final_observation"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminateds, infos)

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs

        # ALGO LOGIC: training.
        if global_step > args.learning_starts:
            data = rb.sample(args.batch_size)
            qf1_state, qf1_loss_value, qf1_a_values = update_critic(
                actor_state,
                qf1_state,
                data.observations.numpy(),
                data.actions.numpy(),
                data.next_observations.numpy(),
                data.rewards.flatten().numpy(),
                data.dones.flatten().numpy(),
            )
            # Delayed policy update: the actor and the targets are refreshed
            # only every `policy_frequency` steps.
            if global_step % args.policy_frequency == 0:
                actor_state, qf1_state, actor_loss_value = update_actor(
                    actor_state,
                    qf1_state,
                    data.observations.numpy(),
                )

            if global_step % 100 == 0:
                writer.add_scalar("losses/qf1_loss", qf1_loss_value.item(), global_step)
                writer.add_scalar("losses/actor_loss", actor_loss_value.item(), global_step)
                writer.add_scalar("losses/qf1_values", qf1_a_values.item(), global_step)
                print("SPS:", int(global_step / (time.time() - start_time)))
                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
        with open(model_path, "wb") as f:
            f.write(
                flax.serialization.to_bytes(
                    [
                        actor_state.params,
                        qf1_state.params,
                    ]
                )
            )
        print(f"model saved to {model_path}")
        from cleanrl_utils.evals.ddpg_jax_eval import evaluate

        episodic_returns = evaluate(
            model_path,
            make_env,
            args.env_id,
            eval_episodes=10,
            run_name=f"{run_name}-eval",
            Model=(Actor, QNetwork),
            exploration_noise=args.exploration_noise,
        )
        for idx, episodic_return in enumerate(episodic_returns):
            writer.add_scalar("eval/episodic_return", episodic_return, idx)

        # Uploading needs the evaluation results, so it only happens for saved models.
        if args.upload_model:
            from cleanrl_utils.huggingface import push_to_hub

            repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
            push_to_hub(args, episodic_returns, repo_id, "DDPG", f"runs/{run_name}", f"videos/{run_name}-eval")

    envs.close()
    writer.close()
events.out.tfevents.1688055374.3090-172.1331626.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2024f38367e87f0c0b84dbfb9fded590ed202bb8cce73ffc5e3174e02435ee36
3
+ size 2931277
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "cleanrl"
3
+ version = "1.1.0"
4
+ description = "High-quality single file implementation of Deep Reinforcement Learning algorithms with research-friendly features"
5
+ authors = ["Costa Huang <[email protected]>"]
6
+ packages = [
7
+ { include = "cleanrl" },
8
+ { include = "cleanrl_utils" },
9
+ ]
10
+ keywords = ["reinforcement", "machine", "learning", "research"]
11
+ license="MIT"
12
+ readme = "README.md"
13
+
14
+ [tool.poetry.dependencies]
15
+ python = ">=3.7.1,<3.11"
16
+ tensorboard = "^2.10.0"
17
+ wandb = "^0.13.11"
18
+ gym = "0.23.1"
19
+ torch = ">=1.12.1"
20
+ stable-baselines3 = "1.2.0"
21
+ gymnasium = ">=0.28.1"
22
+ moviepy = "^1.0.3"
23
+ pygame = "2.1.0"
24
+ huggingface-hub = "^0.11.1"
25
+ rich = "<12.0"
26
+ tenacity = "^8.2.2"
27
+
28
+ ale-py = {version = "0.7.4", optional = true}
29
+ AutoROM = {extras = ["accept-rom-license"], version = "^0.4.2", optional = true}
30
+ opencv-python = {version = "^4.6.0.66", optional = true}
31
+ procgen = {version = "^0.10.7", optional = true}
32
+ pytest = {version = "^7.1.3", optional = true}
33
+ mujoco = {version = "<=2.3.3", optional = true}
34
+ imageio = {version = "^2.14.1", optional = true}
35
+ free-mujoco-py = {version = "^2.1.6", optional = true}
36
+ mkdocs-material = {version = "^8.4.3", optional = true}
37
+ markdown-include = {version = "^0.7.0", optional = true}
38
+ openrlbenchmark = {version = "^0.1.1b4", optional = true}
39
+ jax = {version = "^0.3.17", optional = true}
40
+ jaxlib = {version = "^0.3.15", optional = true}
41
+ flax = {version = "^0.6.0", optional = true}
42
+ optuna = {version = "^3.0.1", optional = true}
43
+ optuna-dashboard = {version = "^0.7.2", optional = true}
44
+ envpool = {version = "^0.6.4", optional = true}
45
+ PettingZoo = {version = "1.18.1", optional = true}
46
+ SuperSuit = {version = "3.4.0", optional = true}
47
+ multi-agent-ale-py = {version = "0.1.11", optional = true}
48
+ boto3 = {version = "^1.24.70", optional = true}
49
+ awscli = {version = "^1.25.71", optional = true}
50
+ shimmy = {version = ">=1.0.0", extras = ["dm-control"], optional = true}
51
+
52
+ [tool.poetry.group.dev.dependencies]
53
+ pre-commit = "^2.20.0"
54
+
55
+
56
+ [tool.poetry.group.isaacgym]
57
+ optional = true
58
+ [tool.poetry.group.isaacgym.dependencies]
59
+ isaacgymenvs = {git = "https://github.com/vwxyzjn/IsaacGymEnvs.git", rev = "poetry", python = ">=3.7.1,<3.10"}
60
+ isaacgym = {path = "cleanrl/ppo_continuous_action_isaacgym/isaacgym", develop = true}
61
+
62
+
63
+ [build-system]
64
+ requires = ["poetry-core"]
65
+ build-backend = "poetry.core.masonry.api"
66
+
67
+ [tool.poetry.extras]
68
+ atari = ["ale-py", "AutoROM", "opencv-python"]
69
+ procgen = ["procgen"]
70
+ plot = ["pandas", "seaborn"]
71
+ pytest = ["pytest"]
72
+ mujoco = ["mujoco", "imageio"]
73
+ mujoco_py = ["free-mujoco-py"]
74
+ jax = ["jax", "jaxlib", "flax"]
75
+ docs = ["mkdocs-material", "markdown-include", "openrlbenchmark"]
76
+ envpool = ["envpool"]
77
+ optuna = ["optuna", "optuna-dashboard"]
78
+ pettingzoo = ["PettingZoo", "SuperSuit", "multi-agent-ale-py"]
79
+ cloud = ["boto3", "awscli"]
80
+ dm_control = ["shimmy", "mujoco"]
81
+
82
+ # dependencies for algorithm variant (useful when you want to run a specific algorithm)
83
+ dqn = []
84
+ dqn_atari = ["ale-py", "AutoROM", "opencv-python"]
85
+ dqn_jax = ["jax", "jaxlib", "flax"]
86
+ dqn_atari_jax = [
87
+ "ale-py", "AutoROM", "opencv-python", # atari
88
+ "jax", "jaxlib", "flax" # jax
89
+ ]
90
+ c51 = []
91
+ c51_atari = ["ale-py", "AutoROM", "opencv-python"]
92
+ c51_jax = ["jax", "jaxlib", "flax"]
93
+ c51_atari_jax = [
94
+ "ale-py", "AutoROM", "opencv-python", # atari
95
+ "jax", "jaxlib", "flax" # jax
96
+ ]
97
+ ppo_atari_envpool_xla_jax_scan = [
98
+ "ale-py", "AutoROM", "opencv-python", # atari
99
+ "jax", "jaxlib", "flax", # jax
100
+ "envpool", # envpool
101
+ ]
102
+ qdagger_dqn_atari_impalacnn = [
103
+ "ale-py", "AutoROM", "opencv-python"
104
+ ]
105
+ qdagger_dqn_atari_jax_impalacnn = [
106
+ "ale-py", "AutoROM", "opencv-python", # atari
107
+ "jax", "jaxlib", "flax", # jax
108
+ ]
109
+
110
+ [[tool.poetry.source]]
111
+ name = "tsinghua"
112
+ url = "https://pypi.tuna.tsinghua.edu.cn/simple"
113
+ default = true
replay.mp4 ADDED
Binary file (34.7 kB). View file
 
videos/InvertedPendulum-v2__ddpg_continuous_action_jax__1__1688055359-eval/rl-video-episode-0.mp4 ADDED
Binary file (2.71 kB). View file
 
videos/InvertedPendulum-v2__ddpg_continuous_action_jax__1__1688055359-eval/rl-video-episode-1.mp4 ADDED
Binary file (2.74 kB). View file
 
videos/InvertedPendulum-v2__ddpg_continuous_action_jax__1__1688055359-eval/rl-video-episode-8.mp4 ADDED
Binary file (34.7 kB). View file