DQN_v1 converging

- DQN_v1.ipynb +113 -145
- DQN_v1_result.mp4 +0 -0
- DQN_v2.ipynb +0 -0

DQN_v1.ipynb
CHANGED
@@ -13,20 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
-   "metadata": {
-    "id": "DDf1gLC2NTiK"
-   },
-   "outputs": [],
-   "source": [
-    "# !pip install -r ./requirements.txt\n",
-    "!pip install stable_baselines3[extra]\n",
-    "!pip install huggingface_sb3\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
     "id": "LNXxxKojNTiL"
    },
@@ -35,7 +22,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2022-12-
+     "2022-12-22 18:43:04.111595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "\n"
     ]
@@ -55,6 +42,7 @@
    "import numpy as np\n",
    "import random\n",
    "from matplotlib import pyplot as plt\n",
+   "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "import io\n",
    "import base64\n",
@@ -63,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,11 +64,11 @@
    "        # Hyperparameters\n",
    "        self.gamma = 0.95             # Discount rate\n",
    "        self.epsilon = 1.0            # Exploration rate\n",
-   "        self.epsilon_min = 0.
-   "        self.epsilon_decay = 0.
-   "        self.update_rate =
+   "        self.epsilon_min = 0.001      # Minimal exploration rate (epsilon-greedy)\n",
+   "        self.epsilon_decay = 0.95     # Decay rate for epsilon\n",
+   "        self.update_rate = 5          # Number of steps until updating the target network\n",
    "        self.batch_size = 100\n",
-   "        self.learning_rate =
+   "        self.learning_rate = 2.5e-4\n",
    "        \n",
    "        # Construct DQN models\n",
    "        self.model = self._build_model()\n",
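Note on the new exploration schedule above: in the rewritten learn() loop further down, epsilon is multiplied by epsilon_decay once per finished episode, so with epsilon_decay = 0.95 and epsilon_min = 0.001 exploration stops decaying after roughly 135 episodes. A quick check, assuming exactly that per-episode decay:

import math

epsilon_min, epsilon_decay = 0.001, 0.95
# smallest n with epsilon_decay**n < epsilon_min, starting from epsilon = 1.0
episodes_to_min = math.ceil(math.log(epsilon_min) / math.log(epsilon_decay))
print(episodes_to_min)  # 135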
@@ -90,120 +78,116 @@
    "        self.env = env\n",
    "        self.action_size = action_size\n",
    "\n",
+   "        self.scaler = None\n",
+   "\n",
    "    def _build_model(self):\n",
    "        model = tf.keras.Sequential()\n",
    "        \n",
    "        model.add(tf.keras.Input(shape=(4,)))\n",
-   "
-   "        model.add(layers.Dense(
-   "        model.add(layers.Dense(
-   "        model.add(layers.Dense(self.action_size, activation='linear'))\n",
+   "        model.add(layers.Dense(512, activation = 'relu'))\n",
+   "        model.add(layers.Dense(256, activation = 'relu'))\n",
+   "        model.add(layers.Dense(128, activation = 'relu'))\n",
+   "        model.add(layers.Dense(self.action_size, activation = 'linear'))\n",
+   "        # model.compile(optimizer = RMSprop(lr = self.lr, rho = 0.95, epsilon = 0.01), loss = \"mse\", metrics = ['accuracy'])\n",
    "        \n",
    "        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
-   "        model.compile(loss='mse', optimizer=
+   "        # model.compile(loss='mse', optimizer=tf.keras.optimizers.RMSprop(lr = self.learning_rate, rho = 0.95, epsilon = 0.01), metrics = ['accuracy'])\n",
+   "        model.compile(loss='mse', optimizer=optimizer, metrics = ['accuracy'])\n",
    "        return model\n",
    "\n",
+   "    def _min_max(self):\n",
+   "        \"\"\"Run some steps to get data to do MINMAX scale \"\"\"\n",
+   "        state_arr = []\n",
+   "        state = self.env.reset()\n",
+   "        state_arr.append(self.env.observation_space.high)\n",
+   "        state_arr.append(self.env.observation_space.low)\n",
+   "        for i in range(1000):\n",
+   "            random_action = self.env.action_space.sample()\n",
+   "            next_state, reward, done, info = self.env.step(random_action)\n",
+   "            state_arr.append(next_state)\n",
+   "            if done:\n",
+   "                state = self.env.reset()\n",
+   "\n",
+   "        state_arr = np.array(state_arr)\n",
+   "        self.scaler = MinMaxScaler()\n",
+   "        self.scaler.fit(state_arr)\n",
    "\n",
    "    #\n",
    "    # Trains the model using randomly selected experiences in the replay memory\n",
    "    #\n",
    "    def _train(self):\n",
-   "
-   "        \n",
-   "        for state, action, reward, next_state, done in minibatch:\n",
-   "            \n",
-   "            if not done:\n",
-   "                model_predict = self.model.predict(np.array([next_state]), verbose=0)\n",
-   "                max_action = np.argmax(model_predict[0])\n",
-   "                target = (reward + self.gamma * self.target_model.predict(np.array([next_state]), verbose=0)[0][max_action])\n",
-   "            else:\n",
-   "                target = reward\n",
-   "            \n",
-   "            # Construct the target vector as follows:\n",
-   "            # 1. Use the current model to output the Q-value predictions\n",
-   "            target_f = self.model.predict(np.array([state]), verbose=0)\n",
-   "            \n",
-   "            # 2. Rewrite the chosen action value with the computed target\n",
-   "            target_f[0][action] = target\n",
-   "            \n",
-   "            # 3. Use vectors in the objective computation\n",
-   "            history = self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)\n",
-   "            print(f\"Loss: {history.history['loss']} \")\n",
-   "        \n",
-   "        if self.epsilon > self.epsilon_min:\n",
-   "            self.epsilon *= self.epsilon_decay\n",
-   "    #\n",
-   "    # Trains the model using randomly selected experiences in the replay memory\n",
-   "    #\n",
-   "    def _train_b(self):\n",
-   "        \n",
+   "        X, y = [], []\n",
    "        # state, action, reward, next_state, done \n",
    "        # create the targets \n",
-   "
+   "        if self.batch_size > len(self.replay_buffer):\n",
+   "            return\n",
+   "        minibatch = random.sample(self.replay_buffer, self.batch_size)\n",
+   "        mb_arr = np.array(minibatch, dtype=object)\n",
    "\n",
    "        next_state_arr = np.stack(mb_arr[:,3])\n",
-   "
-   "
-   "
-   "
-   "
-   "
-   "        if
-   "
+   "        future_qvalues = self.target_model.predict(next_state_arr, verbose=0)\n",
+   "\n",
+   "        state_arr = np.stack(mb_arr[:,0])\n",
+   "        qvalues = self.model.predict(state_arr, verbose=0)\n",
+   "\n",
+   "        for index, (state, action, reward, next_state, done) in enumerate(minibatch):\n",
+   "            if done == True:\n",
+   "                q_target = reward\n",
    "            else:\n",
-   "
+   "                q_target = reward + self.gamma * np.max(future_qvalues[index])\n",
    "\n",
-   "
-   "
-   "
-   "
+   "            q_curr = qvalues[index]\n",
+   "            q_curr[action] = q_target \n",
+   "            X.append(state)\n",
+   "            y.append(q_curr)\n",
    "\n",
    "        # Perform gradient step\n",
-   "
-   "
-   "
-   "
-   "        for idx, val in enumerate(zip(action_arr, target_arr)):\n",
-   "            act, targ = val\n",
-   "            model_predict[idx][act] = targ\n",
+   "        X, y = np.array(X), np.array(y)\n",
+   "        history = self.model.fit(X, y, batch_size = self.batch_size, shuffle = False, verbose=0)\n",
+   "        # history = self.model.fit(X, y, epochs=1, verbose=0)\n",
+   "        # print(f\"Loss: {history.history['loss']} \")\n",
    "\n",
-   "        history = self.model.fit(state_arr, model_predict, epochs=1, verbose=0)\n",
-   "        print(f\"Loss: {history.history['loss']} \")\n",
-   "        # update epsilon\n",
-   "        if self.epsilon > self.epsilon_min:\n",
-   "            self.epsilon *= self.epsilon_decay\n",
    "\n",
    "    def learn(self, total_steps=None):\n",
-   "\n",
-   "
+   "        #create scaler\n",
+   "        self._min_max()\n",
+   "        current_episode = 0\n",
    "        total_reward = 0\n",
-   "        rewards = []\n",
-   "
-   "
-   "
-   "
-   "
-   "
-   "
-   "
+   "        rewards = [0]\n",
+   "        current_step = 0\n",
+   "        while current_step < total_steps:\n",
+   "            current_episode += 1\n",
+   "            state = self.env.reset()\n",
+   "            total_reward = 0\n",
+   "            done = False\n",
+   "            while done != True:\n",
+   "                current_step +=1\n",
+   "                # e-greedy\n",
+   "                if np.random.random() > (1 - self.epsilon):\n",
+   "                    action = random.randrange(self.action_size)\n",
+   "                else:\n",
+   "                    model_predict = self.model.predict(np.array([state]), verbose=0)\n",
+   "                    action = np.argmax(model_predict)\n",
    "\n",
-   "
-   "
-   "
-   "                # add to buffer\n",
-   "                self.replay_buffer.append((state, action, reward, next_state, done))\n",
+   "                # step\n",
+   "                next_state, reward, done, info = self.env.step(action)\n",
+   "                total_reward += reward\n",
    "\n",
-   "
-   "
-   "                total_reward = 0\n",
-   "                state = self.env.reset()\n",
+   "                # add to buffer\n",
+   "                self.replay_buffer.append((state, action, reward, next_state, done))\n",
    "\n",
-   "
-   "
-   "
-   "
-   "
+   "                if current_step>10 and current_step % self.update_rate == 0:\n",
+   "                    print(f\"epsilon:{self.epsilon} step:{current_step} episode:{current_episode} last_score {rewards[-1]} \")\n",
+   "                    self._train()\n",
+   "                    # update target\n",
+   "                    self.target_model.set_weights(self.model.get_weights())\n",
+   "                    \n",
+   "                state = next_state\n",
    "            \n",
+   "            rewards.append(total_reward)\n",
+   "            # update epsilon\n",
+   "            if self.epsilon > self.epsilon_min:\n",
+   "                self.epsilon *= self.epsilon_decay\n",
    "    #\n",
    "    # Loads a saved model\n",
    "    #\n",
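The convergence-relevant change in the hunk above is the rewritten _train(): instead of one predict()/fit() call per transition, it samples a minibatch, runs one batched forward pass through the online network and one through the target network, overwrites the Q-value of the taken action with the Bellman target, and takes a single gradient step. A condensed plain-Python sketch of that logic, pulled out of the notebook JSON (the toy network, the build_net/train_step names and the module-level globals are only here to make the snippet self-contained; the attribute names mirror the class in the diff):

import random
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

gamma, batch_size = 0.95, 100

def build_net(action_size=2):
    # Same 512-256-128 MLP as _build_model() in the diff
    net = tf.keras.Sequential([
        tf.keras.Input(shape=(4,)),
        layers.Dense(512, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(action_size, activation='linear'),
    ])
    net.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=2.5e-4))
    return net

model, target_model = build_net(), build_net()
target_model.set_weights(model.get_weights())

def train_step(replay_buffer):
    # Wait until the buffer holds at least one full batch
    if batch_size > len(replay_buffer):
        return

    minibatch = random.sample(list(replay_buffer), batch_size)
    states = np.stack([t[0] for t in minibatch])
    next_states = np.stack([t[3] for t in minibatch])

    # One batched forward pass per network instead of one predict() per transition
    q_values = model.predict(states, verbose=0)
    future_q = target_model.predict(next_states, verbose=0)

    for i, (state, action, reward, next_state, done) in enumerate(minibatch):
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise
        q_values[i][action] = reward if done else reward + gamma * np.max(future_q[i])

    # Single gradient step on the whole minibatch
    model.fit(states, q_values, batch_size=batch_size, shuffle=False, verbose=0)

In the diff, learn() calls _train() and then copies the online weights into the target network every update_rate = 5 steps, which is what keeps the bootstrapped targets stable between updates.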
@@ -229,89 +213,73 @@
    "env = gym.make('CartPole-v1')\n",
    "\n",
    "model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n",
-   "model.learn(total_steps=
+   "model.learn(total_steps=6_000)\n",
    "env.close()"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count":
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "# env = gym.make('CartPole-v1')\n",
-   "\n",
-   "# model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n",
-   "\n",
-   "# state = model.env.reset()\n",
-   "# for i in range(100):\n",
-   "#     random_action = env.action_space.sample()\n",
-   "#     next_state, reward, done, info = model.env.step(random_action)\n",
-   "#     model.replay_buffer.append((state, random_action, reward, next_state, done))\n",
-   "#     if done:\n",
-   "#         state = model.env.reset()\n",
-   "#     else:\n",
-   "#         state = next_state\n",
-   "\n",
-   "# minibatch = random.sample(model.replay_buffer, 10)\n",
-   "# mb = np.array(minibatch, dtype=object)\n",
-   "# print(mb[:,0])\n",
-   "# np.stack(mb[:,0])\n"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
-   "model.save(\"./m1.h5\")"
+   "model.save(\"./alt/m1.h5\")"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 33,
   "metadata": {},
   "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "Model: \"
+    "Model: \"sequential_28\"\n",
     "_________________________________________________________________\n",
     " Layer (type)                Output Shape              Param #   \n",
     "=================================================================\n",
-    "
+    " dense_97 (Dense)            (None, 512)               2560      \n",
     "                                                                 \n",
-    "
+    " dense_98 (Dense)            (None, 256)               131328    \n",
     "                                                                 \n",
-    "
+    " dense_99 (Dense)            (None, 128)               32896     \n",
+    "                                                                 \n",
+    " dense_100 (Dense)           (None, 2)                 258       \n",
     "                                                                 \n",
     "=================================================================\n",
-    "Total params:
-    "Trainable params:
+    "Total params: 167,042\n",
+    "Trainable params: 167,042\n",
     "Non-trainable params: 0\n",
     "_________________________________________________________________\n",
-    "
+    "Total reward 500.0\n"
    ]
   }
  ],
  "source": [
   "eval_env = gym.make('CartPole-v1')\n",
   "model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)\n",
-  "model.load(\"./m1.h5\")\n",
+  "model.load(\"./alt/m1.h5\")\n",
   "eval_env = wrappers.Monitor(eval_env, \"./alt/gym-results\", force=True)\n",
   "state = eval_env.reset()\n",
+  "total_reward = 0\n",
   "for _ in range(1000):\n",
   "    action = model.play(state)\n",
   "    observation, reward, done, info = eval_env.step(action)\n",
-  "
+  "    total_reward +=reward\n",
   "    state = observation\n",
   "    if done: \n",
-  "        print(reward
+  "        print(f\"Total reward {total_reward}\")\n",
   "        break\n",
   "eval_env.close()"
  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
 }
 ],
 "metadata": {
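The final hunk reloads the saved weights and plays one greedy episode under the Monitor wrapper; the recorded return of 500.0 is the CartPole-v1 step cap. A minimal version of that evaluation rollout, assuming the DQN class and its play() method from the notebook (the Monitor wrapper and video recording are left out to keep the sketch short):

import gym

eval_env = gym.make('CartPole-v1')
model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)
model.load("./alt/m1.h5")

state = eval_env.reset()
total_reward = 0
for _ in range(1000):
    action = model.play(state)                          # greedy action from the trained network
    state, reward, done, info = eval_env.step(action)   # old gym 4-tuple step API, as in the notebook
    total_reward += reward
    if done:
        print(f"Total reward {total_reward}")
        break
eval_env.close()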
DQN_v1_result.mp4
ADDED
Binary file (23.8 kB).

DQN_v2.ipynb
ADDED
The diff for this file is too large to render.