jacksonhack committed on
Commit
203427f
·
verified ·
1 Parent(s): 4d6fa0b

pushing model

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - Hopper-v5
4
+ - deep-reinforcement-learning
5
+ - reinforcement-learning
6
+ - custom-implementation
7
+ library_name: cleanrl
8
+ model-index:
9
+ - name: TD3
10
+ results:
11
+ - task:
12
+ type: reinforcement-learning
13
+ name: reinforcement-learning
14
+ dataset:
15
+ name: Hopper-v5
16
+ type: Hopper-v5
17
+ metrics:
18
+ - type: mean_reward
19
+ value: 1329.44 +/- 156.77
20
+ name: mean_reward
21
+ verified: false
22
+ ---
23
+
24
+ # (CleanRL) **TD3** Agent Playing **Hopper-v5**
25
+
26
+ This is a trained model of a TD3 agent playing Hopper-v5.
27
+ The model was trained by using [CleanRL](https://github.com/vwxyzjn/cleanrl) and the most up-to-date training code can be
28
+ found [here](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/td3.py).
29
+
30
+ ## Get Started
31
+
32
+ To use this model, please install the `cleanrl` package with the following command:
33
+
34
+ ```
35
+ pip install "cleanrl[td3]"
36
+ python -m cleanrl_utils.enjoy --exp-name td3 --env-id Hopper-v5
37
+ ```
38
+
39
+ Please refer to the [documentation](https://docs.cleanrl.dev/get-started/zoo/) for more detail.
40
+
41
+
42
+ ## Command to reproduce the training
43
+
44
+ ```bash
45
+ curl -OL https://huggingface.co/jacksonhack/Hopper-v5-td3-seed1/raw/main/td3.py
46
+ curl -OL https://huggingface.co/jacksonhack/Hopper-v5-td3-seed1/raw/main/pyproject.toml
47
+ curl -OL https://huggingface.co/jacksonhack/Hopper-v5-td3-seed1/raw/main/poetry.lock
48
+ poetry install --all-extras
49
+ python td3.py --save_model --upload_model --track
50
+ ```
51
+
52
+ ## Hyperparameters
53
+ ```python
54
+ {'batch_size': 256,
55
+ 'buffer_size': 1000000,
56
+ 'capture_video': False,
57
+ 'cuda': True,
58
+ 'env_id': 'Hopper-v5',
59
+ 'exp_name': 'td3',
60
+ 'exploration_noise': 0.1,
61
+ 'gamma': 0.99,
62
+ 'hf_entity': 'jacksonhack',
63
+ 'learning_rate': 0.0003,
64
+ 'learning_starts': 25000.0,
65
+ 'noise_clip': 0.5,
66
+ 'policy_frequency': 2,
67
+ 'policy_noise': 0.2,
68
+ 'save_model': True,
69
+ 'seed': 1,
70
+ 'tau': 0.005,
71
+ 'torch_deterministic': True,
72
+ 'total_timesteps': 1000000,
73
+ 'track': True,
74
+ 'upload_model': True,
75
+ 'wandb_entity': None,
76
+ 'wandb_project_name': 'cleanRL'}
77
+ ```
78
+
events.out.tfevents.1732713429.DESKTOP-3BC7099.129416.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd5c52ac63984c5e5f38cc2591f606271c4796402720f8b9a111df17d9b2f7f
3
+ size 4386563
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "rl"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["jackson <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [
8
+ {include = "rl"},
9
+ {include = "rl_utils"},
10
+ ]
11
+
12
+
13
+ [tool.poetry.dependencies]
14
+ python = "^3.10"
15
+ gymnasium = {extras = ["box2d"], version = "^1.0.0"}
16
+ tensorboard = "^2.18.0"
17
+ huggingface-hub = "^0.26.2"
18
+ tyro = "^0.8.14"
19
+ torch = "^2.5.1"
20
+ stable-baselines3 = "^2.3.2"
21
+ numpy = "^1.21.6"
22
+ tenacity = "^9.0.0"
23
+ mujoco = "2.3.3"
24
+
25
+
26
+ [tool.poetry.group.dev.dependencies]
27
+ black = "^24.10.0"
28
+ wandb = "^0.18.7"
29
+ moviepy = "^2.1.1"
30
+
31
+ [build-system]
32
+ requires = ["poetry-core"]
33
+ build-backend = "poetry.core.masonry.api"
replay.mp4 ADDED
Binary file (720 kB). View file
 
td3.cleanrl_model ADDED
Binary file (843 kB). View file
 
td3.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/td3/#td3_continuous_actionpy
2
+ import os
3
+ import random
4
+ import time
5
+ from dataclasses import dataclass
6
+
7
+ import gymnasium as gym
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.optim as optim
13
+ import tyro
14
+ from stable_baselines3.common.buffers import ReplayBuffer
15
+ from torch.utils.tensorboard import SummaryWriter
16
+
17
+
18
@dataclass
class Args:
    """Configuration for one TD3 training run.

    The script entry point builds an instance with `tyro.cli(Args)`, so every
    field doubles as a command-line option. The bare string under each field
    is its description (presumably surfaced by tyro as help text -- confirm).
    """

    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """value assigned to `torch.backends.cudnn.deterministic`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "cleanRL"
    """the wandb's project name"""
    # NOTE(review): annotated `str` but defaults to None; kept unchanged so the
    # command-line interface stays exactly as before.
    wandb_entity: str = None
    """the entity (team) of wandb's project"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    save_model: bool = False
    """whether to save model into the `runs/{run_name}` folder"""
    upload_model: bool = False
    """whether to upload the saved model to huggingface"""
    hf_entity: str = "jacksonhack"
    """the user or org name of the model repository from the Hugging Face Hub"""

    # Algorithm specific arguments
    env_id: str = "Hopper-v5"
    """the id of the environment"""
    total_timesteps: int = 1000000
    """total timesteps of the experiments"""
    learning_rate: float = 3e-4
    """the learning rate of the optimizer"""
    buffer_size: int = int(1e6)
    """the replay memory buffer size"""
    gamma: float = 0.99
    """the discount factor gamma"""
    tau: float = 0.005
    """target smoothing coefficient (default: 0.005)"""
    batch_size: int = 256
    """the batch size of sample from the replay memory"""
    policy_noise: float = 0.2
    """the scale of policy noise"""
    exploration_noise: float = 0.1
    """the scale of exploration noise"""
    # `25e3` is a float literal; wrap it so the default matches the `int`
    # annotation (same idiom as `buffer_size`).
    learning_starts: int = int(25e3)
    """timestep to start learning"""
    policy_frequency: int = 2
    """the frequency of training policy (delayed)"""
    noise_clip: float = 0.5
    """noise clip parameter of the Target Policy Smoothing Regularization"""
68
+
69
+
70
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: id passed to `gym.make`.
        seed: seed applied to the created environment's action space.
        idx: index of this environment; video is recorded only for index 0.
        capture_video: whether video recording is requested at all.
        run_name: name of the sub-folder under `videos/` used for recordings.

    Returns:
        A callable that creates the environment when invoked.
    """
    # Only the first environment records, and only when recording is enabled.
    record_video = capture_video and idx == 0

    def thunk():
        if not record_video:
            base_env = gym.make(env_id)
        else:
            # Frames must be rendered as arrays for the video wrapper.
            rendered_env = gym.make(env_id, render_mode="rgb_array")
            base_env = gym.wrappers.RecordVideo(rendered_env, f"videos/{run_name}")
        tracked_env = gym.wrappers.RecordEpisodeStatistics(base_env)
        tracked_env.action_space.seed(seed)
        return tracked_env

    return thunk
82
+
83
+
84
+ # ALGO LOGIC: initialize agent here:
85
class QNetwork(nn.Module):
    """Critic: maps an (observation, action) pair to a single Q-value.

    Three linear layers (input -> 256 -> 256 -> 1) with ReLU after the first
    two. The input width is the flattened observation size plus the flattened
    action size, both read from `env`.
    """

    def __init__(self, env):
        """Size the layers from `env.single_observation_space` / `env.single_action_space`."""
        super().__init__()
        obs_dim = np.array(env.single_observation_space.shape).prod()
        act_dim = np.prod(env.single_action_space.shape)
        self.fc1 = nn.Linear(obs_dim + act_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        """Concatenate `x` and `a` along dim 1 and return the critic output."""
        joint = torch.cat([x, a], 1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
98
+
99
+
100
class Actor(nn.Module):
    """Deterministic policy: maps an observation to an action.

    Two ReLU hidden layers of width 256 feed a tanh output, which is then
    multiplied by `action_scale` and shifted by `action_bias`.
    """

    def __init__(self, env):
        """Size the layers from `env` and register the rescaling buffers."""
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))
        # action rescaling: half-range and midpoint of the action bounds.
        # NOTE(review): bounds are read from `env.action_space`, not
        # `env.single_action_space`; the buffers take whatever shape those
        # arrays have, and that shape is saved with the model -- confirm
        # before changing.
        upper = env.action_space.high
        lower = env.action_space.low
        half_range = torch.tensor((upper - lower) / 2.0, dtype=torch.float32)
        midpoint = torch.tensor((upper + lower) / 2.0, dtype=torch.float32)
        self.register_buffer("action_scale", half_range)
        self.register_buffer("action_bias", midpoint)

    def forward(self, x):
        """Return the rescaled tanh output of the network for observation `x`."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        squashed = torch.tanh(self.fc_mu(hidden))
        return squashed * self.action_scale + self.action_bias
119
+
120
+
121
if __name__ == "__main__":
    # Script entry point: parse arguments, build the environment and networks,
    # run the TD3 training loop, then optionally save / evaluate / upload.
    import stable_baselines3 as sb3

    # Compare the major version as a number. A plain string comparison is
    # lexicographic, so e.g. "10.0" < "2.0" would be True and a newer release
    # would be rejected.
    if int(sb3.__version__.split(".")[0]) < 2:
        raise ValueError(
            """Ongoing migration: run the following command to install the new dependencies:
poetry run pip install "stable_baselines3==2.0.0a1"
"""
        )

    args = tyro.cli(Args)
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=False,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    # Record every hyperparameter as a markdown table.
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # env setup
    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

    # Online networks plus target copies that start from identical weights.
    actor = Actor(envs).to(device)
    qf1 = QNetwork(envs).to(device)
    qf2 = QNetwork(envs).to(device)
    qf1_target = QNetwork(envs).to(device)
    qf2_target = QNetwork(envs).to(device)
    target_actor = Actor(envs).to(device)
    target_actor.load_state_dict(actor.state_dict())
    qf1_target.load_state_dict(qf1.state_dict())
    qf2_target.load_state_dict(qf2.state_dict())
    # Both critics share a single optimizer.
    q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate)
    actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate)

    # Force float32 on the observation space before handing it to the buffer.
    envs.single_observation_space.dtype = np.float32
    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        handle_timeout_termination=False,
    )
    start_time = time.time()

    # TRY NOT TO MODIFY: start the game
    obs, _ = envs.reset(seed=args.seed)
    for global_step in range(args.total_timesteps):
        # ALGO LOGIC: put action logic here
        if global_step < args.learning_starts:
            # Warm-up phase: sample actions uniformly from the action space.
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            with torch.no_grad():
                actions = actor(torch.Tensor(obs).to(device))
                # Gaussian exploration noise scaled by the action half-range.
                actions += torch.normal(0, actor.action_scale * args.exploration_noise)
                actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards, terminations, truncations, infos = envs.step(actions)

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        # Episode statistics are read from infos["episode"], masked by
        # infos["_episode"]; [0] takes the first finished entry.
        # NOTE(review): an earlier variant iterated infos["final_info"]
        # instead -- confirm which layout the installed gymnasium emits.
        if "episode" in infos:
            print(f"global_step={global_step}, episode_return={infos['episode']['r'][infos['_episode']][0]}")
            writer.add_scalar("charts/episodic_return", infos["episode"]["r"][infos["_episode"]][0], global_step)
            writer.add_scalar("charts/episodic_length", infos["episode"]["l"][infos["_episode"]][0], global_step)

        # TRY NOT TO MODIFY: save data to replay buffer
        # NOTE(review): the former patch that substituted
        # infos["final_observation"] on truncation is disabled; confirm that
        # next_obs is already the true final observation under the installed
        # gymnasium autoreset behaviour.
        real_next_obs = next_obs.copy()

        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs

        # ALGO LOGIC: training.
        if global_step > args.learning_starts:
            data = rb.sample(args.batch_size)
            with torch.no_grad():
                # Target policy smoothing: clipped Gaussian noise on the target action.
                clipped_noise = (torch.randn_like(data.actions, device=device) * args.policy_noise).clamp(
                    -args.noise_clip, args.noise_clip
                ) * target_actor.action_scale

                # NOTE(review): clamps with the bounds of the first action
                # dimension only; assumes every dimension shares them.
                next_state_actions = (target_actor(data.next_observations) + clipped_noise).clamp(
                    envs.single_action_space.low[0], envs.single_action_space.high[0]
                )
                qf1_next_target = qf1_target(data.next_observations, next_state_actions)
                qf2_next_target = qf2_target(data.next_observations, next_state_actions)
                # Use the smaller of the two target critics for the bootstrap value.
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)

            qf1_a_values = qf1(data.observations, data.actions).view(-1)
            qf2_a_values = qf2(data.observations, data.actions).view(-1)
            qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
            qf2_loss = F.mse_loss(qf2_a_values, next_q_value)
            qf_loss = qf1_loss + qf2_loss

            # optimize the model
            q_optimizer.zero_grad()
            qf_loss.backward()
            q_optimizer.step()

            # Delayed policy update: actor and targets move every
            # `policy_frequency` steps.
            if global_step % args.policy_frequency == 0:
                actor_loss = -qf1(data.observations, actor(data.observations)).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                # update the target network (soft update with coefficient tau)
                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
                for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
                for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

            if global_step % 100 == 0:
                writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
                writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
                writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
                writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
                writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
                writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
                print("SPS:", int(global_step / (time.time() - start_time)))
                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
        # Saved as a tuple: (actor, first critic, second critic) state dicts.
        torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path)
        print(f"model saved to {model_path}")
        from rl_utils.evals.td3_eval import evaluate

        episodic_returns = evaluate(
            model_path,
            make_env,
            args.env_id,
            eval_episodes=10,
            run_name=f"{run_name}-eval",
            Model=(Actor, QNetwork),
            device=device,
            exploration_noise=args.exploration_noise,
        )
        for idx, episodic_return in enumerate(episodic_returns):
            writer.add_scalar("eval/episodic_return", episodic_return, idx)

        # Uploading needs the evaluation results, so it only happens when the
        # model was saved.
        if args.upload_model:
            from rl_utils.huggingface import push_to_hub

            repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
            push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval")

    envs.close()
    writer.close()
videos/Hopper-v5__td3__1__1732713426-eval/rl-video-episode-0.mp4 ADDED
Binary file (477 kB). View file
 
videos/Hopper-v5__td3__1__1732713426-eval/rl-video-episode-1.mp4 ADDED
Binary file (634 kB). View file
 
videos/Hopper-v5__td3__1__1732713426-eval/rl-video-episode-8.mp4 ADDED
Binary file (720 kB). View file