|
--- |
|
pipeline_tag: reinforcement-learning |
|
tags: |
|
- ppo |
|
- pong |
|
- cleanRL |
|
- self-play |
|
- pettingzoo |
|
- pong_v3 |
|
--- |
|
# Experiment |
|
PPO agents trained in a self-play setting. This repo includes checkpoints collected during training for 4 experiments:
|
- Shared weights for actor and critic |
|
- No shared weights |
|
- Resumed training for extra steps, for both the shared and non-shared setups
|
Please check our [wandb report](https://wandb.ai/dumas/SPAR_RL_ELK/) for more details, and the training code on [our GitHub](https://github.com/Butanium/cleanrl/blob/master/multiplayer_pong/ppo_pettingzoo_ma_atari.py).
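As a minimal restore sketch (not the exact training-script code): `get_env` and `Agent` are defined in the sections below, and `"checkpoint.pt"` is a placeholder, not an actual filename from this repo.

```py
# Hypothetical restore sketch; "checkpoint.pt" is a placeholder filename.
envs = get_env(args, run_name)           # see the Environment section below
agent = Agent(envs, share_network=True)  # use share_network=False for the non-shared runs
agent.load("checkpoint.pt")              # Agent.load re-ties the critic when weights are shared
```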
|
|
|
|
|
# Environment |
|
Multiplayer pong_v3 from PettingZoo with:

- 4 stacked frames

- The agent is trained to predict the left player's policy (the observation is mirrored for the right agent)
|
|
|
```py
import importlib

import gym
import supersuit as ss


def pong_obs_modification(obs, _space, player_id):
    # Zero the top rows of the frame, which contain the on-screen score
    obs[:9, :, :] = 0
    if "second" in player_id:
        # Mirror the image so both players see the game from the left side
        obs = obs[:, ::-1, :]
    return obs


def get_env(args, run_name):
    env = importlib.import_module(f"pettingzoo.atari.{args.env_id}").parallel_env()
    env = ss.max_observation_v0(env, 2)
    env = ss.frame_skip_v0(env, 4)
    env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1)
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4)
    # Remove the score from the observation
    if "pong" in args.env_id:
        env = ss.lambda_wrappers.observation_lambda_v0(
            env,
            pong_obs_modification,
        )
    # env = ss.agent_indicator_v0(env, type_only=False)
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    envs = ss.concat_vec_envs_v1(env, args.num_envs // 2, num_cpus=0, base_class="gym")
    envs.single_observation_space = envs.observation_space
    envs.single_action_space = envs.action_space
    envs.is_vector_env = True
    envs = gym.wrappers.RecordEpisodeStatistics(envs)
    if args.capture_video:
        envs = gym.wrappers.RecordVideo(envs, f"videos/{run_name}")
    assert isinstance(
        envs.single_action_space, gym.spaces.Discrete
    ), "only discrete action space is supported"
    return envs
```
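For orientation, here is a small sketch of what the wrapper stack produces. The `args` object is a stand-in for the training script's parsed CLI arguments (it is not defined in this card), and the printed shapes follow from the wrappers above.

```py
from types import SimpleNamespace

# Hypothetical stand-in for the training script's parsed CLI arguments
args = SimpleNamespace(env_id="pong_v3", num_envs=16, capture_video=False)

envs = get_env(args, run_name="demo")
print(envs.single_observation_space.shape)  # (84, 84, 4): four stacked 84x84 grayscale frames
print(envs.single_action_space.n)           # number of discrete Pong actions
```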
|
|
|
# Model architecture |
|
```py
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Orthogonal weight initialization with a constant bias (CleanRL's standard helper)
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


def atari_network(orth_init=False):
    init = layer_init if orth_init else lambda m: m
    return nn.Sequential(
        init(nn.Conv2d(4, 32, 8, stride=4)),
        nn.ReLU(),
        init(nn.Conv2d(32, 64, 4, stride=2)),
        nn.ReLU(),
        init(nn.Conv2d(64, 64, 3, stride=1)),
        nn.ReLU(),
        nn.Flatten(),
        init(nn.Linear(64 * 7 * 7, 512)),
        nn.ReLU(),
    )


class Agent(nn.Module):
    def __init__(self, envs, share_network=False):
        super().__init__()
        self.actor_network = atari_network(orth_init=True)
        self.share_network = share_network
        if share_network:
            # Actor and critic share the same convolutional torso
            self.critic_network = self.actor_network
        else:
            self.critic_network = atari_network(orth_init=True)
        self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01)
        self.critic = layer_init(nn.Linear(512, 1), std=1)

    def get_value(self, x):
        x = x.clone()
        # Scale the 4 stacked frames from [0, 255] to [0, 1]
        x[:, :, :, [0, 1, 2, 3]] /= 255.0
        return self.critic(self.critic_network(x.permute((0, 3, 1, 2))))

    def get_action_and_value(self, x, action=None):
        x = x.clone()
        x[:, :, :, [0, 1, 2, 3]] /= 255.0
        # Permute from (N, H, W, C) to (N, C, H, W) before the convolutions
        logits = self.actor(self.actor_network(x.permute((0, 3, 1, 2))))
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return (
            action,
            probs.log_prob(action),
            probs.entropy(),
            self.critic(self.critic_network(x.permute((0, 3, 1, 2)))),
        )

    def load(self, path):
        self.load_state_dict(torch.load(path))
        if self.share_network:
            # Re-tie the critic to the actor torso after loading shared weights
            self.critic_network = self.actor_network
```
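A short usage sketch of the forward pass follows. The dummy batch only illustrates the expected (N, H, W, C) layout with values in [0, 255]; it is not taken from the training loop, and `args` is the same hypothetical stand-in as in the Environment sketch above.

```py
import torch

envs = get_env(args, run_name="demo")     # as in the Environment sketch above
agent = Agent(envs, share_network=False)

# Dummy observation batch in (N, H, W, C) layout with values in [0, 255]
obs = torch.zeros((8, 84, 84, 4))
action, logprob, entropy, value = agent.get_action_and_value(obs)
print(action.shape, logprob.shape, value.shape)  # torch.Size([8]) torch.Size([8]) torch.Size([8, 1])
```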