from __future__ import generator_stop

import numpy as np
import tensorflow as tf

from exp_replay import ExperienceReplay
from processimage import processimage


class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=3,
                 gamma=0.95,
                 frame_skip=3,
                 train_freq=3,
                 initial_epsilon=1.0,
                 min_epsilon=0.05,
                 render=False,
                 epsilon_decay_steps=int(100000),
                 min_experience_size=int(1000),
                 experience_capacity=int(100000),
                 target_network_update_freq=1000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None
                 ):
        self.exp_history = ExperienceReplay(
            num_frame_stack,
            capacity=experience_capacity,
            pic_size=pic_size
        )
        # In playing mode we don't store the experience in the agent history,
        # but this cache is still needed to assemble the current frame stack.
        self.playing_cache = ExperienceReplay(
            num_frame_stack,
            capacity=num_frame_stack * 5 + 10,
            pic_size=pic_size
        )
        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n
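        # Hypothetical example (not part of the original code): for an env with a
        # continuous Box action space such as CarRacing-v0, `action_map` could
        # discretise the controls into a few fixed actions, e.g.
        #   action_map = [
        #       np.array([-1.0, 0.0, 0.0]),  # steer left
        #       np.array([ 1.0, 0.0, 0.0]),  # steer right
        #       np.array([ 0.0, 1.0, 0.0]),  # accelerate
        #       np.array([ 0.0, 0.0, 0.8]),  # brake
        #   ]
        # so the network only has to predict len(action_map) Q-values.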
        self.target_network_update_freq = target_network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        # Default Adam hyperparameters; the learning rate decays exponentially
        # with the global step (0.001 * 0.7 ** (step / 200000)).
        self.global_step = tf.Variable(0, trainable=False)
        self.increment_global_step_op = tf.compat.v1.assign(self.global_step, self.global_step + 1)
        self.decayed_lr = tf.compat.v1.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
        lr = self.decayed_lr
        # lr = 0.001
        self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    def build_graph(self):
        # Channels-last frame stacks: (batch, height, width, num_frame_stack),
        # e.g. (None, 96, 96, 3) with the default settings.
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")
        # The target Q-values come from the fixed network.
        with tf.compat.v1.variable_scope("fixed"):
            # Target network: held fixed and copied from the train network
            # every `target_network_update_freq` steps.
            qsa_targets = self.create_network(self.input_next_state, trainable=False)
        with tf.compat.v1.variable_scope("train"):
            # Prediction/estimate network: trained every `train_freq` steps.
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)
        self.best_action = tf.argmax(qsa_estimates, axis=1)
        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        # Select the Q-value of the chosen action from each row;
        # in numpy this is qsa_estimates[range(batchsize), self.input_actions].
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)
        # DQN loss from the paper: Loss = (r + gamma * max_a' Q_target(s', a') - Q_estimate(s, a))^2
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize
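        # For reference, a NumPy sketch of what the ops above compute (illustrative
        # only; the array names below are local to this comment, not attributes):
        #   q_next   = qsa_targets.max(axis=1)                     # max_a' Q_fixed(s', a')
        #   target   = reward + gamma * q_next * (1 - done_mask)
        #   q_chosen = qsa_estimates[np.arange(batchsize), actions]
        #   loss     = 0.5 * ((target - q_chosen) ** 2).sum() / batchsize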
        # reg_loss = tf.add_n(tf.compat.v1.losses.get_regularization_losses())
        reg_loss = tf.constant(0.0)  # regularization term currently disabled
        # Adam optimizer
        optimizer = tf.compat.v1.train.AdamOptimizer(**(self.optimizer_params))
        # RMSProp optimizer (alternative):
        # optimizer = tf.compat.v1.train.RMSPropOptimizer(**(self.optimizer_params))
        self.train_op = optimizer.minimize(reg_loss + training_loss)
        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")
        assert len(train_params) == len(fixed_params)
        self.copy_network_ops = [tf.compat.v1.assign(fixed_v, train_v)
                                 for train_v, fixed_v in zip(train_params, fixed_params)]

    def get_variables(self, scope):
        variables = [t for t in tf.compat.v1.global_variables()
                     if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(variables, key=lambda v: v.name)

    def create_network(self, inputs, trainable):
        # Only the trainable (estimate) network gets weight regularization.
        if trainable:
            wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
        else:
            wr = None
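        # Shape trace for the layers below (assuming the default 96x96 input and
        # num_frame_stack=3):
        #   input        -> (batch, 96, 96, 3)
        #   conv1 7x7/4  -> (batch, 23, 23, 8)   then 2x2 max pool -> (batch, 12, 12, 8)
        #   conv2 3x3/1  -> (batch, 10, 10, 16)  then 2x2 max pool -> (batch, 5, 5, 16)
        #   flatten      -> (batch, 400) -> dense 400 -> dense dim_actions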
        net = tf.compat.v1.layers.conv2d(inputs=inputs, filters=8, kernel_size=(7, 7), strides=4,
                                         name='conv1', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.compat.v1.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1,
                                         name='conv2', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.compat.v1.layers.flatten(net)
        net = tf.compat.v1.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
        # net = tf.compat.v1.layers.dropout(net, 0.5)
        q_state_action_values = tf.compat.v1.layers.dense(net, self.dim_actions, activation=None,
                                                          kernel_regularizer=wr)
        return q_state_action_values

    def check_early_stop(self, reward, totalreward, frames_in_episode):
        # Stub hook for early episode termination; always continues with no punishment.
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # Linear decay from initial_epsilon down to min_epsilon.
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r
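        # Worked example with the defaults (initial_epsilon=1, min_epsilon=0.05,
        # epsilon_decay_steps=100000): at global_counter=50000, r = 0.5 and
        # epsilon = 0.05 + 0.95 * 0.5 = 0.525.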

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        # Map each placeholder to the matching key of the sampled batch.
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)
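        # Note: based on the keys used above, `sample_mini_batch` is assumed to
        # return a dict with "reward", "prev_state", "next_state", "actions" and
        # "done_mask" entries shaped to match the placeholders in build_graph.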

    def play_episode(self, render, load_checkpoint):
        eh = (
            self.exp_history if self.do_training
            else self.playing_cache
        )
        total_reward = 0
        total_score = 0
        frames_in_episode = 0
        first_frame = self.env.reset()
        first_frame_pp = processimage.process_image(first_frame)
        eh.start_new_episode(first_frame_pp)
        epsilon = self.get_epsilon()
        while True:
            # Epsilon-greedy action selection; when evaluating a loaded
            # checkpoint we always act greedily.
            if np.random.rand() > epsilon and not load_checkpoint:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            elif not load_checkpoint:
                action_idx = self.get_random_action()
            else:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx
            reward = 0
            score = 0
            # Repeat the chosen action for `frame_skip` environment steps.
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if render:
                    self.env.render()
                score += r
                # Scale up positive rewards later in the episode
                # (around frame 230 this adds roughly +46 per rewarded step).
                if r > 0:
                    r = r + frames_in_episode * 0.2
                reward += r
                if done:
                    break
            early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
            if early_done:
                reward += punishment
            done = done or early_done
            total_reward += reward
            total_score += score
            frames_in_episode += 1
            observation = processimage.process_image(observation)
            eh.add_experience(observation, action_idx, done, reward)
            if self.do_training:
                self.global_counter += 1
                self.session.run(self.increment_global_step_op)
                # Copy the train network into the fixed target network every
                # `target_network_update_freq` steps.
                if self.global_counter % self.target_network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()
            if done:
                if self.do_training:
                    self.episode_counter += 1
                return total_score, total_reward, frames_in_episode, epsilon

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
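

# Minimal usage sketch (an assumption, not part of the original training script):
# drives the agent with a TF 1.x graph/session workflow. It assumes a Gym
# environment with 96x96 RGB frames such as CarRacing-v0, the hypothetical
# `action_map` discretisation shown in __init__ above, and that processimage /
# ExperienceReplay produce frame stacks matching the placeholder shapes.
if __name__ == "__main__":
    import gym

    env = gym.make("CarRacing-v0")
    action_map = [
        np.array([-1.0, 0.0, 0.0]),  # steer left
        np.array([1.0, 0.0, 0.0]),   # steer right
        np.array([0.0, 1.0, 0.0]),   # accelerate
        np.array([0.0, 0.0, 0.8]),   # brake
    ]
    agent = DQN(env, action_map=action_map, render=False)
    agent.build_graph()
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        agent.session = sess
        for episode in range(10):
            score, reward, frames, eps = agent.play_episode(render=False, load_checkpoint=False)
            print("episode %d: score=%.1f reward=%.1f frames=%d epsilon=%.3f"
                  % (episode, score, reward, frames, eps))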