In [None]:
import gym

from stable_baselines3 import TD3
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

import wandb
from wandb.integration.sb3 import WandbCallback
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [None]:
# Run settings; this mapping is handed to wandb.init(config=...) so it is
# recorded alongside the run.
config = dict(
    policy_type="MlpPolicy",
    env_name="BipedalWalker-v3",
)

In [None]:
# Open a Weights & Biases run for this experiment and keep its handle in `run`.
run = wandb.init(
    # what is being tracked
    project="BiPedalWalker-v3",
    config=config,
    # what gets uploaded automatically
    sync_tensorboard=True,  # sb3's tensorboard metrics
    monitor_gym=True,  # videos of agents playing the game
    save_code=True,  # optional
)

In [None]:
import gym


# Smoke test: drive the walker with random actions for 200 steps, rendering
# every step, to confirm that the environment works before training.
env = gym.make("BipedalWalker-v3")
observation = env.reset()

for _ in range(200):
    # Draw a random action from the action space and show it
    action = env.action_space.sample()
    print("Action taken:", action)
    env.render()

    # Apply the action; step() hands back (observation, reward, done, info)
    observation, reward, done, info = env.step(action)

    # Nothing more to do while the episode is still running
    if not done:
        continue

    # Episode finished: start a fresh one
    print("Environment is reset")
    observation = env.reset()


In [None]:
# Release the single environment used for the random-action rollout before
# `env` is rebound to the vectorized training environment.
env.close()

In [None]:
# Training environment: 32 copies of BipedalWalker-v3 behind one vectorized env.
# Note that this rebinds `env`, which previously held the single gym environment.
env = make_vec_env("BipedalWalker-v3", n_envs=32)

In [None]:
# Separate single-copy vectorized environment; it is passed to EvalCallback so
# evaluation does not run on the training environments.
eval_env = make_vec_env("BipedalWalker-v3", n_envs=1)

In [None]:
# Ends training once a new best evaluation result reaches a reward of 300.
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=300,
    verbose=1,
)

# Periodically evaluates on `eval_env`; each new best result triggers
# `callback_on_best`.
# NOTE(review): eval_freq is left at its default and, with the 32-env training
# VecEnv, is presumably counted in vectorized steps rather than individual
# timesteps — confirm the resulting evaluation cadence is what you want.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=callback_on_best,
    verbose=1,
)

In [None]:
# TD3 agent with an MLP policy, trained on the vectorized `env`.
model = TD3(
    "MlpPolicy",
    env,
    # optimisation settings
    learning_rate=0.0001,
    batch_size=128,
    gamma=0.999,
    # update schedule
    train_freq=32,
    gradient_steps=32,
    # logging: tensorboard files go under model_log/, console output is silenced
    tensorboard_log="model_log/",
    verbose=0,
)

In [None]:
# Gym id of the environment this notebook works with.
env_id = "BipedalWalker-v3"

In [None]:
# Train for up to 50M timesteps. Metrics are sent to W&B, and the evaluation
# callback may stop the run earlier once its reward threshold is reached.
model.learn(
    total_timesteps=50_000_000,
    callback=[WandbCallback(), eval_callback],
)

In [None]:
# Persist the trained agent to '300-Trained.zip' (path is relative to the
# notebook's working directory).
model.save('300-Trained.zip')

In [None]:
# Reload the checkpoint that model.save() wrote in the previous cell, so the
# notebook is self-contained under Restart & Run All. To evaluate a different
# checkpoint instead, point this path at a file that is known to exist.
model = TD3.load('300-Trained.zip')

In [None]:
# Watch the loaded policy play one rendered, deterministic episode and report
# its reward. With a single episode the reported spread carries no information.
eval_env = gym.make("BipedalWalker-v3")
try:
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=1,
        deterministic=True,
        render=True,
    )
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
finally:
    # Always release the environment, even if evaluation raises, so a failed
    # run does not leave the rendered environment open.
    eval_env.close()

In [None]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# Package the trained agent and publish it to the Hugging Face Hub.
env_id = "BipedalWalker-v3"

model_architecture = "TD3"
model_name = "TD3_BipedalWalker-v3"

repo_id = "SuperSecureHuman/BipedalWalker-v3-TD3"

commit_message = "Upload score 300 trained bipedal walker"

# Single-environment VecEnv, passed to package_to_hub as its evaluation environment
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

try:
    package_to_hub(
        model=model,  # Our trained model
        model_name=model_name,  # The name of our trained model
        model_architecture=model_architecture,  # The model architecture we used: in our case TD3
        env_id=env_id,  # Name of the environment
        eval_env=eval_env,  # Evaluation Environment
        repo_id=repo_id,  # id of the model repository from the Hugging Face Hub
        commit_message=commit_message,
    )
finally:
    # Close the evaluation environment even when the upload fails
    # (e.g. missing credentials or a network error).
    eval_env.close()