Commit b89a51c by VarunKumarGupta2003
1 parent: 0f9b2af

Upload 7 files
Files changed:
- .gitattributes +1 -0
- README.md +7 -11
- SI_Final_Project.pdf +3 -0
- car_dqn.py +58 -0
- dqn.py +268 -0
- exp_replay.py +77 -0
- main.py +156 -0
- processimage.py +25 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+SI_Final_Project.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,8 @@
-
-
-emoji: 🔥
-colorFrom: blue
-colorTo: gray
-sdk: gradio
-sdk_version: 4.31.5
-app_file: app.py
-pinned: false
----
+# AI-self-driving-race-car-Deep-Reinforcement-Learning
+Solving OpenAI's reinforcement learning CarRacing environment
 
-
+In this project, a Python-based car racing environment is trained with a deep reinforcement learning algorithm to drive efficiently around a racing track. A deep Q-learning algorithm is developed and then used to train an autonomous driver agent. Different configurations of the deep Q-learning parameters and of the neural network architecture are tested and compared in order to obtain the best average score over 100 races. According to OpenAI Gym, this environment is considered solved when the agent reaches an average score of 900 over the last 100 runs.
+
+A video with the final car's performance can be seen here: https://www.youtube.com/watch?v=jbdjhoDT41M
+
+A video of the car training can be seen here: https://youtu.be/C9CZpbuOz04
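For reference, the "solved" criterion described in the README reduces to a rolling mean over the last 100 episode scores. A minimal sketch, assuming `episode_scores` is a plain Python list of per-episode totals (illustrative only, not part of the uploaded code):

import numpy as np

def is_solved(episode_scores, window=100, threshold=900.0):
    # The criterion only applies once at least `window` episodes have finished.
    if len(episode_scores) < window:
        return False
    return float(np.mean(episode_scores[-window:])) >= threshold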
SI_Final_Project.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4bee8f9f9195010f83899d24f845e9c36c81e4431a6dbdb4570331e64ddfb06
+size 4964594
car_dqn.py ADDED
@@ -0,0 +1,58 @@
from dqn import DQN
import numpy as np
from skimage import color
import itertools as it


class CarRacingDQN(DQN):
    # CarRacing-specific part of the DQN agent

    # ** is used for unpacking the model configuration
    def __init__(self, max_negative_rewards=100, **model_config):

        # All 12 possible actions:
        # all_actions = np.array([k for k in it.product([-1, 0, 1], [1, 0], [0.5, 0])])

        # Selected 5 actions:
        all_actions = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 0.5], [0, 0, 0], [1, 0, 0]])

        # Set self parameters
        super().__init__(
            action_map=all_actions,
            pic_size=(96, 96),
            **model_config
        )

        self.gas_actions = np.array([a[1] == 1 and a[2] == 0 for a in all_actions])
        self.break_actions = np.array([a[2] > 0 for a in all_actions])
        self.n_gas_actions = self.gas_actions.sum()
        self.neg_reward_counter = 0
        self.max_neg_rewards = max_negative_rewards

    def get_random_action(self):
        # give priority to acceleration actions
        action_weights = 14.0 * self.gas_actions + 1.0
        action_weights /= np.sum(action_weights)

        return np.random.choice(self.dim_actions, p=action_weights)

    def check_early_stop(self, reward, totalreward, fie):
        if reward < 0 and fie > 10:
            self.neg_reward_counter += 1
            done = (self.neg_reward_counter > self.max_neg_rewards)

            if done and totalreward <= 500:
                punishment = -20.0
            else:
                punishment = 0.0
            if done:
                self.neg_reward_counter = 0

            return done, punishment
        else:
            self.neg_reward_counter = 0
            return False, 0.0
dqn.py ADDED
@@ -0,0 +1,268 @@
from __future__ import generator_stop
from exp_replay import ExperienceReplay
import numpy as np
import tensorflow.contrib.slim as slim
import tensorflow as tf
import re
from processimage import processimage


class DQN:

    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=3,
                 gamma=0.95,
                 frame_skip=3,
                 train_freq=3,
                 initial_epsilon=1,
                 min_epsilon=0.05,
                 render=False,
                 epsilon_decay_steps=int(100000),
                 min_experience_size=int(1000),
                 experience_capacity=int(100000),
                 target_network_update_freq=1000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None
                 ):
        self.exp_history = ExperienceReplay(
            num_frame_stack,
            capacity=experience_capacity,
            pic_size=pic_size
        )

        # in playing mode we don't store the experience to agent history
        # but this cache is still needed to get the current frame stack
        self.playing_cache = ExperienceReplay(
            num_frame_stack,
            capacity=num_frame_stack * 5 + 10,
            pic_size=pic_size
        )

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.target_network_update_freq = target_network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        # These default values work well with Adam
        self.global_step = tf.Variable(0, trainable=False)
        self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1)
        self.decayed_lr = tf.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
        lr = self.decayed_lr
        # lr = 0.001
        self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    def build_graph(self):
        # (None, 4, 96, 96) changed to (None, 96, 96, 4)
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        # (64, 4, 96, 96) changed to (64, 96, 96, 4)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)

        self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")

        # The target Q-values come from the fixed network
        with tf.compat.v1.variable_scope("fixed"):  # (64, 96, 96, 3)
            # Target network: kept fixed and updated every C steps
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        with tf.compat.v1.variable_scope("train"):  # (?, 96, 96, 3)
            # Prediction/estimate network: trained every `train_freq` frames
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        # select the chosen action from each row
        # in numpy this is qsa_estimates[range(batchsize), self.input_actions]
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        # From the DQN paper: loss = (r + gamma * max_a' Q_target(s', a') - Q_estimate(s, a))^2
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        # reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        reg_loss = [0]

        # Adam optimizer
        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        # RMSProp optimizer:
        # optimizer = tf.train.RMSPropOptimizer(**(self.optimizer_params))

        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        assert (len(train_params) == len(fixed_params))
        self.copy_network_ops = [tf.assign(fixed_v, train_v) for train_v, fixed_v in zip(train_params, fixed_params)]

    def get_variables(self, scope):
        vars = [t for t in tf.compat.v1.global_variables()
                if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            # wr = None
            wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
        else:
            wr = None

        net = tf.layers.conv2d(inputs=input, filters=8, kernel_size=(7, 7), strides=4, name='conv1', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1, name='conv2',
                               kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.layers.flatten(net)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
        # net = tf.layers.dropout(net, 0.5)
        q_state_action_values = tf.layers.dense(net, self.dim_actions, activation=None, kernel_regularizer=wr)

        return q_state_action_values

    # def check_early_stop(self, reward, totalreward):
    #     return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        # Feed dict: map each placeholder to the matching key of the sampled batch
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self, render, load_checkpoint):
        eh = (
            self.exp_history if self.do_training
            else self.playing_cache
        )
        total_reward = 0
        total_score = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = processimage.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        epsilon = self.get_epsilon()
        while True:
            if np.random.rand() > epsilon and not load_checkpoint:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            elif not load_checkpoint:
                action_idx = self.get_random_action()
            elif load_checkpoint:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            score = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if render:
                    self.env.render()

                score += r
                # Increase rewards on the later frames if the reward is positive
                if r > 0:
                    r = r + frames_in_episode * 0.2  # ~230 frames into an episode this adds roughly +50 reward per tile
                reward += r

                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            total_score += score
            frames_in_episode += 1
            observation = processimage.process_image(observation)
            eh.add_experience(observation, action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                step = self.session.run(self.increment_global_step_op)
                if self.global_counter % self.target_network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                return total_score, total_reward, frames_in_episode, epsilon

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
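The q_target line in build_graph is the usual DQN bootstrap target, with the discount term masked out on terminal transitions. A NumPy sketch of the same arithmetic on a toy batch of three transitions (made-up numbers, for illustration only):

import numpy as np

gamma = 0.95
rewards = np.array([1.0, -0.1, 2.0], dtype=np.float32)
done_mask = np.array([0, 0, 1], dtype=np.int32)       # 1 = episode ended on this transition
qsa_targets = np.array([[0.5, 1.2],                   # Q-values from the fixed/target network
                        [0.3, 0.7],
                        [2.0, 0.1]], dtype=np.float32)

not_done = 1.0 - done_mask.astype(np.float32)
# r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal steps
q_target = qsa_targets.max(axis=-1) * gamma * not_done + rewards
print(q_target)  # approx [2.14, 0.565, 2.0]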
exp_replay.py ADDED
@@ -0,0 +1,77 @@
import numpy as np

class ExperienceReplay:

    def __init__(self,
                 num_frame_stack=4,
                 capacity=int(1e5),
                 pic_size=(96, 96)
                 ):
        self.num_frame_stack = num_frame_stack
        self.capacity = capacity
        self.pic_size = pic_size
        self.counter = 0
        self.frame_window = None
        self.init_caches()
        self.expecting_new_episode = True

    def add_experience(self, frame, action, done, reward):
        assert self.frame_window is not None, "start episode first"
        self.counter += 1
        frame_idx = self.counter % self.max_frame_cache
        exp_idx = (self.counter - 1) % self.capacity

        self.prev_states[exp_idx] = self.frame_window
        self.frame_window = np.append(self.frame_window[1:], frame_idx)
        self.next_states[exp_idx] = self.frame_window
        self.actions[exp_idx] = action
        self.is_done[exp_idx] = done
        self.frames[frame_idx] = frame
        self.rewards[exp_idx] = reward
        if done:
            self.expecting_new_episode = True

    def start_new_episode(self, frame):
        # it should be okay not to increment counter here
        # because episode ending frames are not used
        assert self.expecting_new_episode, "previous episode didn't end yet"
        frame_idx = self.counter % self.max_frame_cache
        self.frame_window = np.repeat(frame_idx, self.num_frame_stack)
        self.frames[frame_idx] = frame
        self.expecting_new_episode = False

    def sample_mini_batch(self, n):
        count = min(self.capacity, self.counter)
        batchidx = np.random.randint(count, size=n)

        prev_frames = self.frames[self.prev_states[batchidx]]
        next_frames = self.frames[self.next_states[batchidx]]
        prev_frames = np.moveaxis(prev_frames, 1, -1)
        next_frames = np.moveaxis(next_frames, 1, -1)
        return {
            "reward": self.rewards[batchidx],
            "prev_state": prev_frames,
            "next_state": next_frames,
            "actions": self.actions[batchidx],
            "done_mask": self.is_done[batchidx]
        }

    def current_state(self):
        # assert not self.expecting_new_episode, "start new episode first"
        assert self.frame_window is not None, "do something first"

        sf = self.frames[self.frame_window]
        sf = np.moveaxis(sf, 0, -1)
        return sf

    def init_caches(self):
        self.rewards = np.zeros(self.capacity, dtype="float32")
        self.prev_states = -np.ones((self.capacity, self.num_frame_stack),
                                    dtype="int32")
        self.next_states = -np.ones((self.capacity, self.num_frame_stack),
                                    dtype="int32")
        self.is_done = -np.ones(self.capacity, "int32")
        self.actions = -np.ones(self.capacity, dtype="int32")

        self.max_frame_cache = self.capacity + 2 * self.num_frame_stack + 1
        self.frames = -np.ones((self.max_frame_cache,) + self.pic_size, dtype="float32")
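A minimal usage sketch for the replay buffer above, assuming 96x96 single-channel frames like the ones processimage.py produces (illustrative, not part of the commit):

import numpy as np
from exp_replay import ExperienceReplay

er = ExperienceReplay(num_frame_stack=3, capacity=1000, pic_size=(96, 96))

frame = np.zeros((96, 96), dtype="float32")
er.start_new_episode(frame)
for t in range(10):
    er.add_experience(frame, action=0, done=(t == 9), reward=1.0)

print(er.current_state().shape)     # (96, 96, 3) -- the frame stack is moved to the last axis
batch = er.sample_mini_batch(4)
print(batch["prev_state"].shape)    # (4, 96, 96, 3)
print(sorted(batch.keys()))         # ['actions', 'done_mask', 'next_state', 'prev_state', 'reward']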
main.py ADDED
@@ -0,0 +1,156 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from car_dqn import CarRacingDQN
import os
import tensorflow as tf
import gym
import _thread
import re
import sys
import numpy as np

# Ensure it is running on GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

load_checkpoint = True
checkpoint_path = "data/checkpoints/train24"
train_episodes = 15000
save_freq_episodes = train_episodes / 100
finished = False
opendir = checkpoint_path + '.txt'
text_results = open(opendir, "w")
render = False

frame_skip = 3  # frame-skip n: the model acts and is trained only every n frames
model_config = dict(
    min_epsilon=0.05,
    max_negative_rewards=8,
    min_experience_size=int(100),
    experience_capacity=int(150000),
    num_frame_stack=frame_skip,
    frame_skip=frame_skip,
    train_freq=frame_skip,
    batchsize=64,
    epsilon_decay_steps=int(100000),
    target_network_update_freq=int(1000),  # copy the prediction network into the target network every 1000 global steps
    gamma=0.95,
    render=False,
)

dqn_scores = []
eps_history = []
avg_score_all = [0]

env = gym.make('CarRacing-v0', verbose=False)

tf.compat.v1.reset_default_graph()

dqn_agent = CarRacingDQN(env=env, **model_config)
dqn_agent.build_graph()
sess = tf.InteractiveSession()
dqn_agent.session = sess

# Initialize the checkpoint saver
saver = tf.train.Saver(max_to_keep=1000)  # keep at most 1000 checkpoints
# Choose whether to load a checkpoint
if load_checkpoint:
    train_episodes = 150
    save_freq_episodes = 0
    print("loading the latest checkpoint from %s" % checkpoint_path)
    ckpt = tf.train.get_checkpoint_state(checkpoint_path)
    assert ckpt, "checkpoint path %s not found" % checkpoint_path
    global_counter = int(re.findall(r"-(\d+)$", ckpt.model_checkpoint_path)[0])
    saver.restore(sess, ckpt.model_checkpoint_path)
    dqn_agent.global_counter = global_counter
    render = True
else:
    if checkpoint_path is not None:
        assert not os.path.exists(checkpoint_path), \
            "checkpoint path already exists but load_checkpoint is false"

    tf.global_variables_initializer().run()


def save_checkpoint():
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    p = os.path.join(checkpoint_path, "m.ckpt")
    saver.save(sess, p, dqn_agent.global_counter)
    print("saved to %s - %d" % (p, dqn_agent.global_counter))


def one_episode(eps_history, dqn_scores, avg_score_all, render, load_checkpoint):
    score, reward, frames, epsilon = dqn_agent.play_episode(render, load_checkpoint)

    eps_history.append(epsilon)

    dqn_scores.append(score)
    i = dqn_agent.episode_counter
    avg_score = np.mean(dqn_scores[max(0, i - 100):(i + 1)])
    avg_score_all.append(avg_score)
    max_avg_score = max(avg_score_all)
    if avg_score >= max_avg_score:
        new_max = ' => New HighScore! <= '
        highscore = True
    else:
        new_max = ''
        highscore = False

    strm = ("#> episode: %i | score: %.2f | total steps: %i | epsilon: %.5f | average 100 score: %.2f" %
            (i, score, dqn_agent.global_counter, epsilon, avg_score))

    print(strm + new_max)

    text_results = open(opendir, "a")
    text_results.write(strm + new_max + '\n')
    text_results.close()

    if not load_checkpoint:
        save_cond = (
            dqn_agent.episode_counter % save_freq_episodes == 0
            and checkpoint_path is not None
            and dqn_agent.do_training
        )
        if save_cond or (highscore and dqn_agent.episode_counter > 100):
            save_checkpoint()

    return eps_history, dqn_scores, avg_score_all


def input_thread(list):
    input("...enter to stop after current episode\n")
    list.append("OK")


def main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint):
    # training/playing loop
    list = []
    _thread.start_new_thread(input_thread, (list,))
    while True:
        if list:
            break
        if dqn_agent.do_training and dqn_agent.episode_counter >= train_episodes:
            break
        eps_history, dqn_scores, avg_score_all = one_episode(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)

    print("done")
    text_results.close()
    exit()
    return eps_history, dqn_scores, avg_score_all


if train_episodes > 0 and dqn_agent.episode_counter < train_episodes and not load_checkpoint:
    print("now training... you can early stop with enter...")
    print("##########")
    sys.stdout.flush()
    main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)
    save_checkpoint()
    print("ok training done")

else:
    print("now just playing...")
    sys.stdout.flush()
    main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)
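The checkpoint-restore branch above recovers the global step by parsing the trailing number in the checkpoint filename. A small sketch of that parsing, using a hypothetical path of the form saver.save() produces (real paths come from tf.train.get_checkpoint_state()):

import re

# Hypothetical checkpoint path; the trailing number is the global step appended by saver.save()
model_checkpoint_path = "data/checkpoints/train24/m.ckpt-123456"

global_counter = int(re.findall(r"-(\d+)$", model_checkpoint_path)[0])
print(global_counter)  # 123456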
processimage.py ADDED
@@ -0,0 +1,25 @@
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
import numpy as np
from skimage import color, transform, io


class processimage:
    def process_image(obs):
        # uncomment to see the original image
        # plt.imshow(obs)
        # plt.show()

        obs1 = obs.astype(np.uint8)
        obs_gray = color.rgb2gray(obs1)
        # obs_gray[abs(obs_gray - 0.60116) < 0.1] = 1
        obs_gray[84:95, 0:12] = 0  # black out the indicator region in the bottom-left corner
        obs_gray[abs(obs_gray - 0.68616) < 0.0001] = 1  # snap two specific gray levels to white
        obs_gray[abs(obs_gray - 0.75630) < 0.0001] = 1
        # uncomment to see the pre-processed image
        # plt.imshow(obs_gray, cmap='gray')
        # plt.show()

        # Scale values to [-1, 1] for input normalization
        return 2 * obs_gray - 1
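Finally, a minimal usage sketch of the preprocessing above, with a random RGB array standing in for a real CarRacing observation (illustrative only):

import numpy as np
from processimage import processimage

# Stand-in for a CarRacing frame: 96x96 RGB, uint8 values in [0, 255]
fake_obs = np.random.randint(0, 256, size=(96, 96, 3), dtype=np.uint8)

processed = processimage.process_image(fake_obs)
print(processed.shape)                                   # (96, 96) -- single grayscale channel
print(processed.min() >= -1.0, processed.max() <= 1.0)   # values normalized to [-1, 1]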