# DQN

#### This version implements DQN with Keras


In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
import gym
from gym import spaces
from gym.utils import seeding
from gym import wrappers

from tqdm.notebook import tqdm
from collections import deque
import numpy as np
import random
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

import io
import base64
from IPython.display import HTML, Video


2022-12-22 18:43:04.111595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.



In [29]:
class DQN:
    """Deep Q-Network agent built with Keras.

    The agent keeps an online network (``model``) that is trained on
    minibatches sampled from a bounded replay buffer, and a target network
    (``target_model``) that supplies the bootstrap values for the TD targets.

    Args:
        env: Environment exposing the classic gym API used in this class
            (``reset()`` returning a state, ``step(action)`` returning
            ``(next_state, reward, done, info)``, ``action_space`` and
            ``observation_space``). May be ``None`` when the agent is only
            used through ``load``/``play``.
        replay_buffer_size: Maximum number of transitions kept in the buffer.
        action_size: Number of discrete actions (size of the output layer).
    """

    # Input shape used when no environment (or no observation shape) is given.
    _DEFAULT_STATE_SHAPE = (4,)

    def __init__(self, env=None, replay_buffer_size=1000, action_size=2):
        self.env = env
        self.action_size = action_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)

        # Hyperparameters
        self.gamma = 0.95            # Discount rate
        self.epsilon = 1.0           # Exploration rate
        self.epsilon_min = 0.001     # Minimal exploration rate (epsilon-greedy)
        self.epsilon_decay = 0.95    # Decay rate for epsilon (applied per episode)
        self.update_rate = 5         # Steps between train + target-sync rounds
        self.batch_size = 100
        self.learning_rate = 2.5e-4

        # Shape of a single observation fed to the networks.
        self.state_shape = self._infer_state_shape(env)

        # Construct DQN models; the target network starts as a copy of the
        # online network.
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()

        # Fitted by ``_min_max``.
        # NOTE(review): the scaler is fitted but never applied to states in
        # this class - confirm whether observations were meant to be scaled.
        self.scaler = None

    @staticmethod
    def _infer_state_shape(env):
        """Return the observation shape of *env*, or the default ``(4,)``."""
        space = getattr(env, "observation_space", None)
        shape = getattr(space, "shape", None)
        return tuple(shape) if shape else DQN._DEFAULT_STATE_SHAPE

    def _build_model(self):
        """Build and compile the fully connected Q-network.

        Returns:
            A compiled ``tf.keras.Sequential`` mapping one state to
            ``action_size`` Q-values.
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.Input(shape=self.state_shape))
        model.add(layers.Dense(512, activation='relu'))
        model.add(layers.Dense(256, activation='relu'))
        model.add(layers.Dense(128, activation='relu'))
        # Linear head: the outputs are unbounded Q-value estimates.
        model.add(layers.Dense(self.action_size, activation='linear'))

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        # No classification metric here: the network regresses Q-values, so
        # 'accuracy' would not be meaningful.
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def _min_max(self):
        """Fit ``self.scaler`` on observations gathered with random actions.

        The observation-space bounds plus the states seen during 1000 random
        steps are used as the fitting data.
        """
        # NOTE(review): if the observation space has very large or unbounded
        # limits, including high/low will dominate the fitted range - confirm.
        samples = [
            self.env.observation_space.high,
            self.env.observation_space.low,
        ]
        self.env.reset()
        for _ in range(1000):
            random_action = self.env.action_space.sample()
            next_state, _reward, done, _info = self.env.step(random_action)
            samples.append(next_state)
            if done:
                self.env.reset()

        self.scaler = MinMaxScaler()
        self.scaler.fit(np.array(samples))

    @staticmethod
    def _compute_targets(qvalues, future_qvalues, actions, rewards, dones, gamma):
        """Build the regression targets for one minibatch.

        Each row is the current prediction with the entry of the taken action
        replaced by ``reward`` (terminal transition) or
        ``reward + gamma * max(future_qvalues)`` (non-terminal transition).

        Args:
            qvalues: Online-network predictions, shape ``(batch, actions)``.
            future_qvalues: Target-network predictions for the next states,
                same shape as ``qvalues``.
            actions: Index of the action taken in each transition.
            rewards: Reward received in each transition.
            dones: Truthy where the transition ended the episode.
            gamma: Discount rate.

        Returns:
            A new array shaped like ``qvalues``; the input is not modified.
        """
        targets = np.array(qvalues, copy=True)
        actions = np.asarray(actions, dtype=np.intp)
        rewards = np.asarray(rewards, dtype=targets.dtype)
        dones = np.asarray(dones, dtype=bool)

        bootstrapped = rewards + gamma * np.max(future_qvalues, axis=1)
        # Terminal transitions have no future value to bootstrap from.
        td_targets = np.where(dones, rewards, bootstrapped)
        targets[np.arange(len(actions)), actions] = td_targets
        return targets

    def _train(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.
        """
        if self.batch_size > len(self.replay_buffer):
            return

        minibatch = random.sample(self.replay_buffer, self.batch_size)
        # Each transition is (state, action, reward, next_state, done).
        states, actions, rewards, next_states, dones = zip(*minibatch)
        state_arr = np.stack(states)
        next_state_arr = np.stack(next_states)

        future_qvalues = self.target_model.predict(next_state_arr, verbose=0)
        qvalues = self.model.predict(state_arr, verbose=0)
        targets = self._compute_targets(
            qvalues, future_qvalues, actions, rewards, dones, self.gamma
        )

        # Perform gradient step
        self.model.fit(
            state_arr,
            targets,
            batch_size=self.batch_size,
            shuffle=False,
            verbose=0,
        )

    def _greedy_action(self, state):
        """Return the action with the highest predicted Q-value for *state*."""
        q_values = self.model.predict(np.array([state]), verbose=0)
        return int(np.argmax(q_values[0]))

    def _select_action(self, state):
        """Pick an action epsilon-greedily: random with probability epsilon."""
        if np.random.random() < self.epsilon:
            return random.randrange(self.action_size)
        return self._greedy_action(state)

    def _decay_epsilon(self):
        """Multiply epsilon by the decay rate, never going below the minimum."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def learn(self, total_steps=None):
        """Train the agent by interacting with ``self.env``.

        New episodes are started while fewer than ``total_steps`` environment
        steps have been taken; an episode that is in progress always runs to
        completion. Every ``update_rate`` steps (after the first 10) the
        online network is trained on a minibatch and the target network is
        synchronised with it. Epsilon is decayed once per episode.

        Args:
            total_steps: Step budget; required despite the ``None`` default.

        Returns:
            List with the total reward of each completed episode.

        Raises:
            TypeError: If ``total_steps`` is ``None``.
        """
        # Fail fast, before spending environment steps on fitting the scaler.
        if total_steps is None:
            raise TypeError("learn() requires a numeric total_steps")

        # create scaler
        self._min_max()

        # Leading 0 gives the progress message a "last score" before the
        # first episode has finished.
        rewards = [0]
        current_episode = 0
        current_step = 0
        while current_step < total_steps:
            current_episode += 1
            state = self.env.reset()
            total_reward = 0
            done = False
            while not done:
                current_step += 1
                action = self._select_action(state)

                # step
                next_state, reward, done, _info = self.env.step(action)
                total_reward += reward

                # add to buffer
                self.replay_buffer.append((state, action, reward, next_state, done))

                if current_step > 10 and current_step % self.update_rate == 0:
                    print(f"epsilon:{self.epsilon} step:{current_step} episode:{current_episode} last_score {rewards[-1]} ")
                    self._train()
                    # update target
                    self.target_model.set_weights(self.model.get_weights())

                state = next_state

            rewards.append(total_reward)
            self._decay_epsilon()

        return rewards[1:]

    def load(self, name):
        """Load the online network's weights from the file *name*."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the online network's weights to the file *name*."""
        self.model.save_weights(name)

    def play(self, state):
        """Return the greedy (highest Q-value) action for *state*."""
        return self._greedy_action(state)

In [None]:
# Train a DQN agent on CartPole-v1 for a budget of 6,000 environment steps.
env = gym.make('CartPole-v1')
try:
    model = DQN(env=env, replay_buffer_size=10_000, action_size=2)
    model.learn(total_steps=6_000)
finally:
    # Release the environment even if training fails or is interrupted.
    env.close()

In [31]:
# Persist the trained weights; create the output directory first so the
# save does not fail on a fresh checkout.
import os

os.makedirs("./alt", exist_ok=True)
model.save("./alt/m1.h5")

In [33]:
# Evaluate the saved agent greedily for one episode (capped at 1000 steps)
# while recording the run to ./alt/gym-results.
eval_env = gym.make('CartPole-v1')
try:
    model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)
    model.load("./alt/m1.h5")
    eval_env = wrappers.Monitor(eval_env, "./alt/gym-results", force=True)

    state = eval_env.reset()
    total_reward = 0
    for _ in range(1000):
        action = model.play(state)
        observation, reward, done, info = eval_env.step(action)
        total_reward += reward
        state = observation
        if done:
            break
    # Report the score even if the step cap was reached before the episode
    # ended.
    print(f"Total reward {total_reward}")
finally:
    # Closing also lets the monitor wrapper finish its output, and must
    # happen even when loading or stepping fails.
    eval_env.close()

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_97 (Dense)            (None, 512)               2560      
                                                                 
 dense_98 (Dense)            (None, 256)               131328    
                                                                 
 dense_99 (Dense)            (None, 128)               32896     
                                                                 
 dense_100 (Dense)           (None, 2)                 258       
                                                                 
Total params: 167,042
Trainable params: 167,042
Non-trainable params: 0
_________________________________________________________________
Total reward 500.0
