# Spaces: VarunKumarGupta2003 / SelfDriving
# Configuration error
# File size: 10,999 Bytes
# b89a51c

from __future__ import generator_stop
from exp_replay import ExperienceReplay
import numpy as np
import tensorflow.contrib.slim as slim
import tensorflow as tf
import re
from processimage import processimage


class DQN:

    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=3,
                 gamma=0.95,
                 frame_skip=3,
                 train_freq=3,
                 initial_epsilon=1,
                 min_epsilon=0.05,
                 render=False,
                 epsilon_decay_steps=int(100000),
                 min_experience_size=int(1000),
                 experience_capacity=int(100000),
                 target_network_update_freq=1000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        """Store the hyper-parameters and create the replay buffers.

        No network is built here; ``build_graph`` does that. ``self.session``
        starts as ``None`` and has to be assigned before playing or training.

        Args:
            env: Environment object. ``env.action_space.n`` is read when
                ``action_map`` is ``None``; ``play_episode`` calls its
                ``reset``, ``step`` and ``render`` methods.
            batchsize: Mini-batch size sampled in ``train``; also the fixed
                leading dimension of most placeholders in ``build_graph``.
            pic_size: Two-element tuple with the frame dimensions used for
                the replay buffers and the input placeholders.
            num_frame_stack: Number of frames stacked into one state.
            gamma: Discount factor applied to the target Q-values.
            frame_skip: Number of environment steps each chosen action is
                repeated for in ``play_episode``.
            train_freq: A training step is run when the global counter is a
                multiple of this value.
            initial_epsilon: Exploration rate at global counter 0.
            min_epsilon: Exploration rate once ``epsilon_decay_steps`` steps
                have been taken.
            render: Stored on ``self.render``; ``play_episode`` takes its own
                ``render`` argument.
            epsilon_decay_steps: Length of the linear epsilon decay.
            min_experience_size: Minimum ``exp_history.counter`` before
                training starts.
            experience_capacity: Capacity of the training replay buffer.
            target_network_update_freq: Step interval used by
                ``play_episode`` when deciding whether to copy weights into
                the target network.
            regularization: L2 coefficient for the kernel regularizers of the
                trained network.
            optimizer_params: Keyword arguments for the Adam optimizer. When
                falsy, a decayed learning rate and ``epsilon=1e-7`` are used.
            action_map: Optional sequence indexed by action index that yields
                the action passed to ``env.step``; its length defines the
                number of actions.
        """
        # Replay memory used while training.
        self.exp_history = ExperienceReplay(
            num_frame_stack, capacity=experience_capacity, pic_size=pic_size)

        # In playing mode experience is not stored in the training history,
        # but a small cache is still needed to obtain the current frame stack.
        self.playing_cache = ExperienceReplay(
            num_frame_stack, capacity=num_frame_stack * 5 + 10, pic_size=pic_size)

        # Number of discrete actions the network has to score.
        if action_map is None:
            self.dim_actions = env.action_space.n
        else:
            self.dim_actions = len(action_map)

        # Environment and action handling.
        self.env = env
        self.action_map = action_map
        self.render = render
        self.frame_skip = frame_skip

        # Input geometry.
        self.pic_size = pic_size
        self.num_frame_stack = num_frame_stack

        # Learning hyper-parameters.
        self.batchsize = batchsize
        self.gamma = gamma
        self.train_freq = train_freq
        self.regularization = regularization
        self.min_experience_size = min_experience_size
        self.target_network_update_freq = target_network_update_freq

        # Exploration schedule.
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps

        # Step counter living in the TF graph; it drives the learning-rate decay.
        self.global_step = tf.Variable(0, trainable=False)
        self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1)
        self.decayed_lr = tf.train.exponential_decay(
            0.001, self.global_step, 200000, 0.7, staircase=False)

        # Default Adam settings, used unless the caller supplies its own.
        if not optimizer_params:
            optimizer_params = {"learning_rate": self.decayed_lr, "epsilon": 1e-7}
        self.optimizer_params = optimizer_params

        # Runtime state.
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.global_counter = 0
        self.episode_counter = 0

        self.state_size = (self.num_frame_stack,) + self.pic_size

    def build_graph(self):
        """Build placeholders, both Q-networks, the loss and the training ops.

        Two copies of the network are created through ``create_network``:
        one under the variable scope ``"fixed"`` (target network, fed with
        the next states) and one under ``"train"`` (estimate network, fed
        with the previous states).

        Sets the following attributes:
            input_prev_state, input_next_state, input_reward, input_actions,
            input_done_mask: the feed placeholders.
            best_action: argmax over the estimated Q-values per input row.
            train_op: one Adam step on the temporal-difference loss.
            copy_network_ops: assign ops copying every ``"train"`` variable
                into the ``"fixed"`` variable paired with it by sorted name.

        Raises:
            AssertionError: if the two scopes hold a different number of
                variables.
        """
        # Frames are laid out channels-last: (batch, pic_size[0], pic_size[1], frames).
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)

        # prev_state keeps a free batch dimension so it can also be fed a
        # single state when choosing an action.
        self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")

        # The target Q-values come from the fixed network, which is only
        # changed by running copy_network_ops.
        with tf.compat.v1.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        # The estimate network is the one the optimizer updates.
        with tf.compat.v1.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        # 1.0 for transitions that did not end the episode, 0.0 otherwise.
        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")

        # Select the Q-value of the action actually taken in each row; in
        # numpy this is qsa_estimates[range(batchsize), self.input_actions].
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        # TD target: r + gamma * max_a' Q_fixed(s', a'), without the
        # bootstrap term for terminal transitions.
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        # Treat the target as a constant so that minimising the loss never
        # back-propagates into the target ("fixed") network.
        q_target = tf.stop_gradient(q_target)

        # tf.nn.l2_loss is sum(x ** 2) / 2, so this is half the mean squared
        # TD error over the batch.
        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        # NOTE: regularization losses are not added to the objective; the
        # loss is kept a scalar instead of adding a Python list to it.
        optimizer = tf.train.AdamOptimizer(**self.optimizer_params)
        self.train_op = optimizer.minimize(training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        # Both lists are sorted by name, so equal length means they pair up
        # element by element.
        assert len(train_params) == len(fixed_params), \
            "train and fixed networks must have the same number of variables"
        self.copy_network_ops = [
            tf.assign(fixed_v, train_v)
            for train_v, fixed_v in zip(train_params, fixed_params)
        ]

    def get_variables(self, scope):
        """Return the global variables belonging to ``scope``, sorted by name.

        A variable is selected when its name contains ``"<scope>/"`` and does
        not contain ``"Adam"`` (which leaves out the optimizer's variables).
        """
        needle = "%s/" % scope
        selected = []
        for variable in tf.compat.v1.global_variables():
            name = variable.name
            if needle in name and "Adam" not in name:
                selected.append(variable)
        selected.sort(key=lambda variable: variable.name)
        return selected

    def create_network(self, input, trainable):
        """Build the Q-network on top of ``input`` and return its output tensor.

        Architecture: conv 7x7/4 (8 filters) -> ReLU -> 2x2 max-pool ->
        conv 3x3/1 (16 filters) -> ReLU -> 2x2 max-pool -> flatten ->
        dense 400 (ReLU) -> dense ``self.dim_actions`` (linear).

        Args:
            input: Float tensor holding a batch of stacked frames.
            trainable: Whether the layer variables are created as trainable.
                When true the kernels also get an L2 regularizer with
                coefficient ``self.regularization``.

        Returns:
            Tensor with one Q-value per action for every input row.
        """
        # Only the trained network is regularised; the target network merely
        # receives copies of its weights.
        if trainable:
            wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
        else:
            wr = None

        # `trainable` is forwarded to every layer so that variables of a
        # non-trainable network stay out of the trainable-variables
        # collection and are never touched by the optimizer.
        net = tf.layers.conv2d(inputs=input, filters=8, kernel_size=(7, 7), strides=4, name='conv1',
                               kernel_regularizer=wr, trainable=trainable)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1, name='conv2',
                               kernel_regularizer=wr, trainable=trainable)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.layers.flatten(net)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr,
                              trainable=trainable)
        q_state_action_values = tf.layers.dense(net, self.dim_actions, activation=None,
                                                kernel_regularizer=wr, trainable=trainable)

        return q_state_action_values

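    # NOTE: play_episode() calls
    # self.check_early_stop(reward, total_reward, frames_in_episode) and
    # unpacks the result as (early_done, punishment). No such method is
    # defined in this class, so it has to be provided elsewhere (e.g. by a
    # subclass). A no-op version would look like:
    # def check_early_stop(self, reward, totalreward, frames_in_episode):
    #     return False, 0.0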

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        # Feed dict
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self, render, load_checkpoint):
        """Play one episode in ``self.env`` and return its statistics.

        While ``self.do_training`` is set, transitions are stored in
        ``self.exp_history``, the step counters advance, the target network
        is synchronised every ``self.target_network_update_freq`` steps and a
        training step runs every ``self.train_freq`` steps once enough
        experience has been collected. Otherwise only ``self.playing_cache``
        is used to keep the current frame stack.

        ``self.check_early_stop(reward, total_reward, frames_in_episode)`` is
        called after every action and must return ``(early_done,
        punishment)``. NOTE(review): that method is not defined in the part
        of the class visible here; presumably a subclass supplies it.

        Args:
            render: If true, ``self.env.render()`` is called after every
                environment step.
            load_checkpoint: If true, the greedy action is always taken
                (no epsilon exploration).

        Returns:
            Tuple ``(total_score, total_reward, frames_in_episode, epsilon)``:
            the sum of raw environment rewards, the sum of shaped rewards
            (including any early-stop punishment), the number of agent
            decisions taken, and the epsilon used for the episode.
        """
        eh = self.exp_history if self.do_training else self.playing_cache
        total_reward = 0
        total_score = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = processimage.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        # Epsilon is evaluated once and kept for the whole episode.
        epsilon = self.get_epsilon()
        while True:
            # The sample is drawn unconditionally so the NumPy random stream
            # is consumed the same way with and without a checkpoint.
            sample = np.random.rand()
            if load_checkpoint or sample > epsilon:
                # Greedy action from the estimate network.
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            # Repeat the chosen action for frame_skip environment steps and
            # accumulate what it earned.
            reward = 0
            score = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if render:
                    self.env.render()

                score += r
                # Reward shaping: positive rewards grow with the number of
                # decisions already taken in this episode.
                if r > 0:
                    r = r + frames_in_episode * 0.2
                reward += r

                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            total_score += score
            frames_in_episode += 1
            observation = processimage.process_image(observation)
            eh.add_experience(observation, action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                self.session.run(self.increment_global_step_op)
                # Sync the target network only on multiples of the update
                # frequency. The previous truthiness test on the remainder
                # did the opposite and synced on every other step.
                if self.global_counter % self.target_network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                return total_score, total_reward, frames_in_episode, epsilon

    def update_target_network(self):
        self.session.run(self.copy_network_ops)