from __future__ import generator_stop

import re

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from exp_replay import ExperienceReplay
from processimage import processimage


class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=3,
                 gamma=0.95,
                 frame_skip=3,
                 train_freq=3,
                 initial_epsilon=1,
                 min_epsilon=0.05,
                 render=False,
                 epsilon_decay_steps=int(100000),
                 min_experience_size=int(1000),
                 experience_capacity=int(100000),
                 target_network_update_freq=1000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None
                 ):
        self.exp_history = ExperienceReplay(
            num_frame_stack,
            capacity=experience_capacity,
            pic_size=pic_size
        )

        # In playing mode we don't store the experience in the agent's history,
        # but this cache is still needed to get the current frame stack.
        self.playing_cache = ExperienceReplay(
            num_frame_stack,
            capacity=num_frame_stack * 5 + 10,
            pic_size=pic_size
        )

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.target_network_update_freq = target_network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization

        # These default values work well with Adam.
        self.global_step = tf.Variable(0, trainable=False)
        self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1)
        self.decayed_lr = tf.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
        lr = self.decayed_lr
        # lr = 0.001
        self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    def build_graph(self):
        # (None, 4, 96, 96) changed to channels-last (None, 96, 96, 4)
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        # Input dimensions: (64, 4, 96, 96) changed to channels-last (64, 96, 96, 4)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)

        self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")

        # The target Q-values come from the fixed network.
        with tf.compat.v1.variable_scope("fixed"):  # (64, 96, 96, 3)
            # Create the target network, which is kept fixed and refreshed every C steps.
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        with tf.compat.v1.variable_scope("train"):  # (?, 96, 96, 3)
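            # Online ("train") network: the only weights the optimizer updates.
            # copy_network_ops (built below) copies them into the "fixed" scope
            # every target_network_update_freq steps, so the bootstrap targets
            # lag behind the current estimates.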
            # Create the prediction/estimate network, which will be trained/updated every 3 frames.
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")

        # Select the chosen action from each row;
        # in numpy this is qsa_estimates[range(batchsize), self.input_actions]
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        # Taken from the paper: Loss = [(r + gamma * max Q_target) - Q_estimate]^2
        # Done transitions contribute only the immediate reward.
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        # reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        reg_loss = [0]

        # Adam optimizer
        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        # RMSProp optimizer (alternative):
        # optimizer = tf.train.RMSPropOptimizer(**(self.optimizer_params))

        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")
        assert len(train_params) == len(fixed_params)
        self.copy_network_ops = [tf.assign(fixed_v, train_v)
                                 for train_v, fixed_v in zip(train_params, fixed_params)]

    def get_variables(self, scope):
        vars = [t for t in tf.compat.v1.global_variables()
                if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            # wr = None
            wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
        else:
            wr = None

        net = tf.layers.conv2d(inputs=input, filters=8, kernel_size=(7, 7), strides=4,
                               name='conv1', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')

        net = tf.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1,
                               name='conv2', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')

        net = tf.layers.flatten(net)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
        # net = tf.layers.dropout(net, 0.5)
        q_state_action_values = tf.layers.dense(net, self.dim_actions, activation=None,
                                                kernel_regularizer=wr)

        return q_state_action_values

    def check_early_stop(self, reward, total_reward, frames_in_episode):
        # Stub: never end an episode early and apply no punishment.
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)

        # Feed dict: map each placeholder to the corresponding key of the sampled batch.
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self, render, load_checkpoint):
        eh = (
            self.exp_history if self.do_training
            else self.playing_cache
        )

        total_reward = 0
        total_score = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = processimage.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)
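        # Epsilon-greedy exploration: epsilon decays linearly from initial_epsilon
        # to min_epsilon over epsilon_decay_steps global steps (see get_epsilon);
        # in play-only mode the fixed playing_epsilon is used instead.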
        epsilon = self.get_epsilon()
        while True:
            if np.random.rand() > epsilon and not load_checkpoint:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            elif not load_checkpoint:
                action_idx = self.get_random_action()
            elif load_checkpoint:
                # When playing from a checkpoint, always act greedily.
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            score = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if render:
                    self.env.render()
                score += r
                # Increase rewards on the later frames if the reward is positive
                # (over ~230 late-game frames this adds roughly +-50 reward for tiles).
                if r > 0:
                    r = r + frames_in_episode * 0.2
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            total_score += score
            frames_in_episode += 1

            observation = processimage.process_image(observation)
            eh.add_experience(observation, action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                step = self.session.run(self.increment_global_step_op)
                # Refresh the fixed target network every target_network_update_freq steps.
                if self.global_counter % self.target_network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1
                return total_score, total_reward, frames_in_episode, epsilon

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
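
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original script).
# It assumes gym's "CarRacing-v0" environment and a hand-picked discrete
# action_map of (steer, gas, brake) triples; neither appears in this file,
# so adapt both to whatever the rest of the project actually uses.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import gym

    env = gym.make("CarRacing-v0")
    # Each discrete action index maps to a continuous control command.
    action_map = [
        (0.0, 1.0, 0.0),   # accelerate
        (-1.0, 0.0, 0.0),  # steer left
        (1.0, 0.0, 0.0),   # steer right
        (0.0, 0.0, 0.8),   # brake
    ]

    agent = DQN(env, action_map=action_map)
    agent.build_graph()

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())
        agent.session = session
        for episode in range(1000):
            score, reward, frames, eps = agent.play_episode(render=False, load_checkpoint=False)
            print(episode, score, reward, frames, eps)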