VarunKumarGupta2003 committed
Commit b89a51c
1 Parent(s): 0f9b2af

Upload 7 files

Files changed (8):
  1. .gitattributes +1 -0
  2. README.md +7 -11
  3. SI_Final_Project.pdf +3 -0
  4. car_dqn.py +58 -0
  5. dqn.py +268 -0
  6. exp_replay.py +77 -0
  7. main.py +156 -0
  8. processimage.py +25 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ SI_Final_Project.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,8 @@
- ---
- title: SelfDriving
- emoji: 🔥
- colorFrom: blue
- colorTo: gray
- sdk: gradio
- sdk_version: 4.31.5
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AI-self-driving-race-car-Deep-Reinforcement-Learning
+ Solving OpenAI's reinforcement learning CarRacing environment
+
+ In this project, a deep reinforcement learning agent is trained to drive efficiently around the track of the Python-based CarRacing environment. A deep Q-learning (DQN) algorithm is developed and used to train the autonomous driver, and different settings of the DQN hyperparameters and of the neural network architecture are then tested and compared to obtain the best average score over 100 races. According to OpenAI Gym, the environment is considered solved when the agent reaches an average score of 900 over the last 100 runs.
+
+ A video of the final car's performance: https://www.youtube.com/watch?v=jbdjhoDT41M
+
+ A video of the car training: https://youtu.be/C9CZpbuOz04
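The "solved" criterion described in the README above is a rolling average over the last 100 episode scores. A minimal illustrative sketch of that check (not part of the repository; the function name is made up):

    import numpy as np

    def is_solved(scores, window=100, threshold=900.0):
        # the environment counts as solved once the mean of the last
        # `window` episode scores reaches `threshold`
        if len(scores) < window:
            return False
        return float(np.mean(scores[-window:])) >= threshold

    print(is_solved([905.0] * 100))  # True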
SI_Final_Project.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4bee8f9f9195010f83899d24f845e9c36c81e4431a6dbdb4570331e64ddfb06
+ size 4964594
car_dqn.py ADDED
@@ -0,0 +1,58 @@
+ from dqn import DQN
+ import numpy as np
+ from skimage import color
+ import itertools as it
+
+
+ class CarRacingDQN(DQN):
+     # CarRacing-specific part of the DQN agent
+
+     # ** is used for unpacking the model configuration
+     def __init__(self, max_negative_rewards=100, **model_config):
+
+         # all 12 possible actions:
+         # all_actions = np.array([k for k in it.product([-1, 0, 1], [1, 0], [0.5, 0])])
+
+         # the 5 selected actions: steer left, accelerate, brake, no-op, steer right
+         all_actions = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 0.5], [0, 0, 0], [1, 0, 0]])
+
+         # set the base-class parameters
+         super().__init__(
+             action_map=all_actions,
+             pic_size=(96, 96),
+             **model_config
+         )
+
+         self.gas_actions = np.array([a[1] == 1 and a[2] == 0 for a in all_actions])
+         self.break_actions = np.array([a[2] > 0 for a in all_actions])
+         self.n_gas_actions = self.gas_actions.sum()
+         self.neg_reward_counter = 0
+         self.max_neg_rewards = max_negative_rewards
+
+     def get_random_action(self):
+         # give priority to acceleration actions
+         action_weights = 14.0 * self.gas_actions + 1.0
+         action_weights /= np.sum(action_weights)
+
+         return np.random.choice(self.dim_actions, p=action_weights)
+
+     def check_early_stop(self, reward, totalreward, fie):
+         # fie: number of frames elapsed in the current episode
+         if reward < 0 and fie > 10:
+             self.neg_reward_counter += 1
+             done = (self.neg_reward_counter > self.max_neg_rewards)
+
+             if done and totalreward <= 500:
+                 punishment = -20.0
+             else:
+                 punishment = 0.0
+             if done:
+                 self.neg_reward_counter = 0
+
+             return done, punishment
+         else:
+             self.neg_reward_counter = 0
+             return False, 0.0
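For context on the action map defined above: CarRacing expects a continuous [steer, gas, brake] vector, so the agent picks a discrete index and looks up the corresponding row. A small illustrative sketch (numpy only; the env.step call is left commented because it assumes an already-created CarRacing environment):

    import numpy as np

    action_map = np.array([[-1, 0, 0],    # steer left
                           [ 0, 1, 0],    # accelerate
                           [ 0, 0, 0.5],  # brake
                           [ 0, 0, 0],    # no-op
                           [ 1, 0, 0]])   # steer right

    action_idx = 1                   # index chosen by the agent
    action = action_map[action_idx]  # -> array([0., 1., 0.])
    # observation, reward, done, info = env.step(action)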
dqn.py ADDED
@@ -0,0 +1,268 @@
+ from __future__ import generator_stop
+ from exp_replay import ExperienceReplay
+ import numpy as np
+ import tensorflow.contrib.slim as slim
+ import tensorflow as tf
+ import re
+ from processimage import processimage
+
+
+ class DQN:
+
+     def __init__(self,
+                  env,
+                  batchsize=64,
+                  pic_size=(96, 96),
+                  num_frame_stack=3,
+                  gamma=0.95,
+                  frame_skip=3,
+                  train_freq=3,
+                  initial_epsilon=1,
+                  min_epsilon=0.05,
+                  render=False,
+                  epsilon_decay_steps=int(100000),
+                  min_experience_size=int(1000),
+                  experience_capacity=int(100000),
+                  target_network_update_freq=1000,
+                  regularization=1e-6,
+                  optimizer_params=None,
+                  action_map=None
+                  ):
+         self.exp_history = ExperienceReplay(
+             num_frame_stack,
+             capacity=experience_capacity,
+             pic_size=pic_size
+         )
+
+         # in playing mode we don't store the experience to the agent history
+         # but this cache is still needed to get the current frame stack
+         self.playing_cache = ExperienceReplay(
+             num_frame_stack,
+             capacity=num_frame_stack * 5 + 10,
+             pic_size=pic_size
+         )
+
+         if action_map is not None:
+             self.dim_actions = len(action_map)
+         else:
+             self.dim_actions = env.action_space.n
+
+         self.target_network_update_freq = target_network_update_freq
+         self.action_map = action_map
+         self.env = env
+         self.batchsize = batchsize
+         self.num_frame_stack = num_frame_stack
+         self.gamma = gamma
+         self.frame_skip = frame_skip
+         self.train_freq = train_freq
+         self.initial_epsilon = initial_epsilon
+         self.min_epsilon = min_epsilon
+         self.epsilon_decay_steps = epsilon_decay_steps
+         self.render = render
+         self.min_experience_size = min_experience_size
+         self.pic_size = pic_size
+         self.regularization = regularization
+         # exponentially decayed learning rate; these default values work well with Adam
+         self.global_step = tf.Variable(0, trainable=False)
+         self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1)
+         self.decayed_lr = tf.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
+         lr = self.decayed_lr
+         # lr = 0.001
+         self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)
+
+         self.do_training = True
+         self.playing_epsilon = 0.0
+         self.session = None
+
+         self.state_size = (self.num_frame_stack,) + self.pic_size
+         self.global_counter = 0
+         self.episode_counter = 0
+
+     def build_graph(self):
+         # channels-last input shapes: (batch, 96, 96, num_frame_stack)
+         input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
+         input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
+
+         self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
+         self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
+         self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
+         self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
+         self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")
+
+         # The target Q-values come from the fixed network
+         with tf.compat.v1.variable_scope("fixed"):
+             # target network, kept fixed and refreshed every C steps by copying the trained parameters
+             qsa_targets = self.create_network(self.input_next_state, trainable=False)
+
+         with tf.compat.v1.variable_scope("train"):
+             # prediction/estimate network, trained every train_freq frames
+             qsa_estimates = self.create_network(self.input_prev_state, trainable=True)
+
+         self.best_action = tf.argmax(qsa_estimates, axis=1)
+
+         not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
+         # select the chosen action from each row
+         # in numpy this is qsa_estimates[range(batchsize), self.input_actions]
+         action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
+         q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)
+
+         # From the DQN paper: Loss = (r + gamma * max_a' Q_target(s', a') - Q_estimate(s, a))^2
+         q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
+         training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize
+
+         # reg_loss = tf.add_n(tf.losses.get_regularization_losses())
+         reg_loss = [0]
+
+         # Adam optimizer
+         optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
+         # RMSProp optimizer (alternative):
+         # optimizer = tf.train.RMSPropOptimizer(**(self.optimizer_params))
+
+         self.train_op = optimizer.minimize(reg_loss + training_loss)
+
+         train_params = self.get_variables("train")
+         fixed_params = self.get_variables("fixed")
+
+         assert (len(train_params) == len(fixed_params))
+         self.copy_network_ops = [tf.assign(fixed_v, train_v)
+                                  for train_v, fixed_v in zip(train_params, fixed_params)]
+
+     def get_variables(self, scope):
+         vars = [t for t in tf.compat.v1.global_variables()
+                 if "%s/" % scope in t.name and "Adam" not in t.name]
+         return sorted(vars, key=lambda v: v.name)
+
+     def create_network(self, input, trainable):
+         if trainable:
+             # wr = None
+             wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
+         else:
+             wr = None
+
+         net = tf.layers.conv2d(inputs=input, filters=8, kernel_size=(7, 7), strides=4, name='conv1', kernel_regularizer=wr)
+         net = tf.nn.relu(net)
+         net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
+         net = tf.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1, name='conv2',
+                                kernel_regularizer=wr)
+         net = tf.nn.relu(net)
+         net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
+         net = tf.layers.flatten(net)
+         net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
+         # net = tf.layers.dropout(net, 0.5)
+         q_state_action_values = tf.layers.dense(net, self.dim_actions, activation=None, kernel_regularizer=wr)
+
+         return q_state_action_values
+
+     # def check_early_stop(self, reward, totalreward):
+     #     return False, 0.0
+
+     def get_random_action(self):
+         return np.random.choice(self.dim_actions)
+
+     def get_epsilon(self):
+         if not self.do_training:
+             return self.playing_epsilon
+         elif self.global_counter >= self.epsilon_decay_steps:
+             return self.min_epsilon
+         else:
+             # linear decay
+             r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
+             return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r
+
+     def train(self):
+         batch = self.exp_history.sample_mini_batch(self.batchsize)
+         # map each placeholder to the corresponding key of the sampled batch
+         fd = {
+             self.input_reward: "reward",
+             self.input_prev_state: "prev_state",
+             self.input_next_state: "next_state",
+             self.input_actions: "actions",
+             self.input_done_mask: "done_mask"
+         }
+         fd1 = {ph: batch[k] for ph, k in fd.items()}
+         self.session.run([self.train_op], fd1)
+
+     def play_episode(self, render, load_checkpoint):
+         eh = (
+             self.exp_history if self.do_training
+             else self.playing_cache
+         )
+         total_reward = 0
+         total_score = 0
+         frames_in_episode = 0
+
+         first_frame = self.env.reset()
+         first_frame_pp = processimage.process_image(first_frame)
+
+         eh.start_new_episode(first_frame_pp)
+
+         epsilon = self.get_epsilon()
+         while True:
+             if np.random.rand() > epsilon and not load_checkpoint:
+                 action_idx = self.session.run(
+                     self.best_action,
+                     {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
+                 )[0]
+             elif not load_checkpoint:
+                 action_idx = self.get_random_action()
+             elif load_checkpoint:
+                 action_idx = self.session.run(
+                     self.best_action,
+                     {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
+                 )[0]
+
+             if self.action_map is not None:
+                 action = self.action_map[action_idx]
+             else:
+                 action = action_idx
+
+             reward = 0
+             score = 0
+             for _ in range(self.frame_skip):
+                 observation, r, done, info = self.env.step(action)
+                 if render:
+                     self.env.render()
+
+                 score += r
+                 # increase rewards on late frames if the reward is positive
+                 if r > 0:
+                     r = r + frames_in_episode * 0.2  # around frame 230 this adds roughly +50 reward per tile
+                 reward += r
+
+                 if done:
+                     break
+
+             early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
+             if early_done:
+                 reward += punishment
+
+             done = done or early_done
+
+             total_reward += reward
+             total_score += score
+             frames_in_episode += 1
+             observation = processimage.process_image(observation)
+             eh.add_experience(observation, action_idx, done, reward)
+
+             if self.do_training:
+                 self.global_counter += 1
+                 step = self.session.run(self.increment_global_step_op)
+                 # copy the trained parameters into the target network every target_network_update_freq steps
+                 if self.global_counter % self.target_network_update_freq == 0:
+                     self.update_target_network()
+                 train_cond = (
+                     self.exp_history.counter >= self.min_experience_size and
+                     self.global_counter % self.train_freq == 0
+                 )
+                 if train_cond:
+                     self.train()
+
+             if done:
+                 if self.do_training:
+                     self.episode_counter += 1
+
+                 return total_score, total_reward, frames_in_episode, epsilon
+
+     def update_target_network(self):
+         self.session.run(self.copy_network_ops)
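The q_target line in build_graph() implements the standard Q-learning target: r for terminal transitions and r + gamma * max_a' Q_fixed(s', a') otherwise. A small numpy sketch of the same computation (illustrative only, with made-up values):

    import numpy as np

    gamma = 0.95
    qsa_targets = np.array([[1.0, 2.0], [0.5, 0.2]])  # Q-values from the fixed/target network
    rewards = np.array([1.0, -0.5])
    done_mask = np.array([0, 1])                       # 1 marks a terminal transition

    not_done = 1.0 - done_mask
    q_target = qsa_targets.max(axis=-1) * gamma * not_done + rewards
    print(q_target)  # [ 2.9 -0.5]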
exp_replay.py ADDED
@@ -0,0 +1,77 @@
+ import numpy as np
+
+
+ class ExperienceReplay:
+
+     def __init__(self,
+                  num_frame_stack=4,
+                  capacity=int(1e5),
+                  pic_size=(96, 96)
+                  ):
+         self.num_frame_stack = num_frame_stack
+         self.capacity = capacity
+         self.pic_size = pic_size
+         self.counter = 0
+         self.frame_window = None
+         self.init_caches()
+         self.expecting_new_episode = True
+
+     def add_experience(self, frame, action, done, reward):
+         assert self.frame_window is not None, "start episode first"
+         self.counter += 1
+         frame_idx = self.counter % self.max_frame_cache
+         exp_idx = (self.counter - 1) % self.capacity
+
+         self.prev_states[exp_idx] = self.frame_window
+         self.frame_window = np.append(self.frame_window[1:], frame_idx)
+         self.next_states[exp_idx] = self.frame_window
+         self.actions[exp_idx] = action
+         self.is_done[exp_idx] = done
+         self.frames[frame_idx] = frame
+         self.rewards[exp_idx] = reward
+         if done:
+             self.expecting_new_episode = True
+
+     def start_new_episode(self, frame):
+         # it should be okay not to increment counter here
+         # because episode ending frames are not used
+         assert self.expecting_new_episode, "previous episode didn't end yet"
+         frame_idx = self.counter % self.max_frame_cache
+         self.frame_window = np.repeat(frame_idx, self.num_frame_stack)
+         self.frames[frame_idx] = frame
+         self.expecting_new_episode = False
+
+     def sample_mini_batch(self, n):
+         count = min(self.capacity, self.counter)
+         batchidx = np.random.randint(count, size=n)
+
+         prev_frames = self.frames[self.prev_states[batchidx]]
+         next_frames = self.frames[self.next_states[batchidx]]
+         prev_frames = np.moveaxis(prev_frames, 1, -1)
+         next_frames = np.moveaxis(next_frames, 1, -1)
+         return {
+             "reward": self.rewards[batchidx],
+             "prev_state": prev_frames,
+             "next_state": next_frames,
+             "actions": self.actions[batchidx],
+             "done_mask": self.is_done[batchidx]
+         }
+
+     def current_state(self):
+         # assert not self.expecting_new_episode, "start new episode first"
+         assert self.frame_window is not None, "do something first"
+
+         sf = self.frames[self.frame_window]
+         sf = np.moveaxis(sf, 0, -1)
+         return sf
+
+     def init_caches(self):
+         self.rewards = np.zeros(self.capacity, dtype="float32")
+         self.prev_states = -np.ones((self.capacity, self.num_frame_stack),
+                                     dtype="int32")
+         self.next_states = -np.ones((self.capacity, self.num_frame_stack),
+                                     dtype="int32")
+         self.is_done = -np.ones(self.capacity, "int32")
+         self.actions = -np.ones(self.capacity, dtype="int32")
+
+         self.max_frame_cache = self.capacity + 2 * self.num_frame_stack + 1
+         self.frames = -np.ones((self.max_frame_cache,) + self.pic_size, dtype="float32")
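A minimal usage sketch of the replay buffer above (illustrative; assumes exp_replay.py is importable): start an episode, push a few random frames, then draw a mini-batch whose frame stacks already have the channel axis last, matching the (batch, 96, 96, num_frame_stack) placeholders in dqn.py.

    import numpy as np
    from exp_replay import ExperienceReplay

    replay = ExperienceReplay(num_frame_stack=3, capacity=1000, pic_size=(96, 96))
    replay.start_new_episode(np.zeros((96, 96), dtype="float32"))
    for t in range(10):
        frame = np.random.rand(96, 96).astype("float32")
        replay.add_experience(frame, action=np.random.randint(5), done=False, reward=1.0)

    batch = replay.sample_mini_batch(4)
    print(batch["prev_state"].shape)  # (4, 96, 96, 3)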
main.py ADDED
@@ -0,0 +1,156 @@
+ from __future__ import absolute_import, division, print_function, unicode_literals
+ from car_dqn import CarRacingDQN
+ import os
+ import tensorflow as tf
+ import gym
+ import _thread
+ import re
+ import sys
+ import numpy as np
+
+ # Ensure it is running on GPU
+ print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
+
+ load_checkpoint = True
+ checkpoint_path = "data/checkpoints/train24"
+ train_episodes = 15000
+ save_freq_episodes = train_episodes / 100  # save a checkpoint every 1% of the training episodes
+ finished = False
+ opendir = checkpoint_path + '.txt'
+ text_results = open(opendir, "w")
+ render = False
+
+ frame_skip = 3  # each chosen action is repeated for frame_skip frames and the model is trained every frame_skip steps
+ model_config = dict(
+     min_epsilon=0.05,
+     max_negative_rewards=8,
+     min_experience_size=int(100),  # start training once 100 experiences are stored
+     experience_capacity=int(150000),
+     num_frame_stack=frame_skip,
+     frame_skip=frame_skip,
+     train_freq=frame_skip,
+     batchsize=64,
+     epsilon_decay_steps=int(100000),
+     target_network_update_freq=int(1000),  # copy the prediction network into the target network every 1000 global steps
+     gamma=0.95,
+     render=False,
+ )
+
+ dqn_scores = []
+ eps_history = []
+ avg_score_all = [0]
+
+ env = gym.make('CarRacing-v0', verbose=False)
+
+ tf.compat.v1.reset_default_graph()
+
+ dqn_agent = CarRacingDQN(env=env, **model_config)
+ dqn_agent.build_graph()
+ sess = tf.InteractiveSession()
+ dqn_agent.session = sess
+
+ # Initialize checkpoint saving
+ saver = tf.train.Saver(max_to_keep=1000)  # keep at most 1000 checkpoints
+ # Choose whether to load a checkpoint
+ if load_checkpoint:
+     train_episodes = 150
+     save_freq_episodes = 0
+     print("loading the latest checkpoint from %s" % checkpoint_path)
+     ckpt = tf.train.get_checkpoint_state(checkpoint_path)
+     assert ckpt, "checkpoint path %s not found" % checkpoint_path
+     global_counter = int(re.findall(r"-(\d+)$", ckpt.model_checkpoint_path)[0])
+     saver.restore(sess, ckpt.model_checkpoint_path)
+     dqn_agent.global_counter = global_counter
+     render = True
+ else:
+     if checkpoint_path is not None:
+         assert not os.path.exists(checkpoint_path), \
+             "checkpoint path already exists but load_checkpoint is false"
+
+     tf.global_variables_initializer().run()
+
+
+ def save_checkpoint():
+     if not os.path.exists(checkpoint_path):
+         os.makedirs(checkpoint_path)
+     p = os.path.join(checkpoint_path, "m.ckpt")
+     saver.save(sess, p, dqn_agent.global_counter)
+     print("saved to %s - %d" % (p, dqn_agent.global_counter))
+
+
+ def one_episode(eps_history, dqn_scores, avg_score_all, render, load_checkpoint):
+     score, reward, frames, epsilon = dqn_agent.play_episode(render, load_checkpoint)
+
+     eps_history.append(epsilon)
+
+     dqn_scores.append(score)
+     i = dqn_agent.episode_counter
+     avg_score = np.mean(dqn_scores[max(0, i - 100):(i + 1)])
+     avg_score_all.append(avg_score)
+     max_avg_score = max(avg_score_all)
+     if avg_score >= max_avg_score:
+         new_max = ' => New HighScore! <= '
+         highscore = True
+     else:
+         new_max = ''
+         highscore = False
+
+     strm = ("#> episode: %i | score: %.2f | total steps: %i | epsilon: %.5f | average 100 score: %.2f" %
+             (i, score, dqn_agent.global_counter, epsilon, avg_score))
+
+     print(strm + new_max)
+
+     text_results = open(opendir, "a")
+     text_results.write(strm + new_max + '\n')
+     text_results.close()
+
+     if not load_checkpoint:
+         save_cond = (
+             dqn_agent.episode_counter % save_freq_episodes == 0
+             and checkpoint_path is not None
+             and dqn_agent.do_training
+         )
+         if save_cond or (highscore and dqn_agent.episode_counter > 100):
+             save_checkpoint()
+
+     return eps_history, dqn_scores, avg_score_all
+
+
+ def input_thread(stop_list):
+     input("...enter to stop after current episode\n")
+     stop_list.append("OK")
+
+
+ def main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint):
+     # training/playing loop; pressing enter stops after the current episode
+     stop_list = []
+     _thread.start_new_thread(input_thread, (stop_list,))
+     while True:
+         if stop_list:
+             break
+         if dqn_agent.do_training and dqn_agent.episode_counter >= train_episodes:
+             break
+         eps_history, dqn_scores, avg_score_all = one_episode(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)
+
+     print("done")
+     text_results.close()
+     exit()
+     return eps_history, dqn_scores, avg_score_all
+
+
+ if train_episodes > 0 and dqn_agent.episode_counter < train_episodes and not load_checkpoint:
+     print("now training... you can early stop with enter...")
+     print("##########")
+     sys.stdout.flush()
+     main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)
+     save_checkpoint()
+     print("ok training done")
+
+ else:
+     print("now just playing...")
+     sys.stdout.flush()
+     main_loop(eps_history, dqn_scores, avg_score_all, render, load_checkpoint)
processimage.py ADDED
@@ -0,0 +1,25 @@
+ import matplotlib.pyplot as plt
+ import matplotlib.image as mpimg
+ import cv2
+ import numpy as np
+ from skimage import color, transform, io
+
+
+ class processimage:
+     @staticmethod
+     def process_image(obs):
+         # uncomment to see the original image
+         # plt.imshow(obs)
+         # plt.show()
+
+         obs1 = obs.astype(np.uint8)
+         obs_gray = color.rgb2gray(obs1)
+         # obs_gray[abs(obs_gray - 0.60116) < 0.1] = 1
+         obs_gray[84:95, 0:12] = 0  # black out the indicator bar in the bottom-left corner
+         obs_gray[abs(obs_gray - 0.68616) < 0.0001] = 1  # map the two grass shades to white
+         obs_gray[abs(obs_gray - 0.75630) < 0.0001] = 1
+         # uncomment to see the pre-processed image
+         # plt.imshow(obs_gray, cmap='gray')
+         # plt.show()
+
+         # scale values to [-1, 1] for input normalization
+         return 2 * obs_gray - 1
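A quick illustrative check of the pre-processing above (assumes numpy and scikit-image are installed; the random array stands in for a real CarRacing frame): the output is a single-channel 96x96 image scaled to [-1, 1].

    import numpy as np
    from processimage import processimage

    dummy_obs = np.random.randint(0, 255, size=(96, 96, 3), dtype=np.uint8)
    gray = processimage.process_image(dummy_obs)
    print(gray.shape, float(gray.min()) >= -1.0, float(gray.max()) <= 1.0)  # (96, 96) True True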