Initial Commit
- .gitignore +163 -0
- Qlearning_pole.py +222 -0
- config.ini +14 -0
- convergence.png +0 -0
- convergence15kBuffer(1klimiter).png +0 -0
- convergence20000.png +0 -0
- histogram.png +0 -0
- main.py +55 -0
- readme.md +63 -0
- requirements.txt +6 -0
- resources/boxplot.png +0 -0
- resources/boxplot15k_bufferMemory.png +0 -0
- resources/boxplot_5k_bufferMemory.png +0 -0
- resources/boxplot_old.png +0 -0
- resources/boxplot_old_theta.png +0 -0
- resources/convergence20kbuffer.png +0 -0
- resources/convergence_old.png +0 -0
- resources/histogram_old.png +0 -0
- resources/old_agent20k.png +0 -0
- resources/old_boxplot.png +0 -0
- resources/trained_agent20k.png +0 -0
- test_weights.py +40 -0
- time.py +32 -0
- watchModel.py +8 -0
.gitignore
ADDED
@@ -0,0 +1,163 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
/shelf/
/workspace.xml
*.npy
Qlearning_pole.py
ADDED
@@ -0,0 +1,222 @@
import random

import numpy as np
import gym
import time
from tqdm import tqdm
import configparser


class Qlearning:
    ###########################################################################
    # START - __init__ function
    ###########################################################################
    # INPUTS:
    # env - Cart Pole environment
    # alpha - step size
    # gamma - discount rate
    # epsilon - parameter for the epsilon-greedy approach
    # numberEpisodes - total number of simulation episodes

    # numberOfBins - a 4-entry list that defines the number of grid points used for
    # state discretization, i.e. the number of bins for each state entry:
    # cart position, cart velocity, pole angle, and pole angular velocity
    # (e.g. with number_of_bins_position = 50 the cart position axis is split into
    # 50 intervals between lowerBounds[0] and upperBounds[0])

    # lowerBounds - lower bounds (limits) for discretization, list with 4 entries:
    # lower bounds on cart position, cart velocity, pole angle, and pole angular velocity

    # upperBounds - upper bounds (limits) for discretization, list with 4 entries:
    # upper bounds on cart position, cart velocity, pole angle, and pole angular velocity
    def __init__(self, env=gym.make('CartPole-v1'), file='config.ini'):
        self.env = env
        self.load_values(file)

    def load_values(self, file):
        config = configparser.ConfigParser()
        config.read(file)

        cart_velocity_min = float(config['Parameters']['cart_velocity_min'])
        cart_velocity_max = float(config['Parameters']['cart_velocity_max'])
        pole_angle_velocity_min = float(config['Parameters']['pole_angle_velocity_min'])
        pole_angle_velocity_max = float(config['Parameters']['pole_angle_velocity_max'])
        number_of_bins_position = int(config['Parameters']['number_of_bins_position'])
        number_of_bins_velocity = int(config['Parameters']['number_of_bins_velocity'])
        number_of_bins_angle = int(config['Parameters']['number_of_bins_angle'])
        number_of_bins_angle_velocity = int(config['Parameters']['number_of_bins_angle_velocity'])
        self.action_number = self.env.action_space.n
        self.alpha = float(config['Parameters']['alpha'])
        self.gamma = float(config['Parameters']['gamma'])
        self.epsilon = float(config['Parameters']['epsilon'])
        self.numEpisodes = int(config['Parameters']['number_episodes'])

        # The velocity entries of the observation space are unbounded, so replace
        # them with the finite limits from the config before discretization.
        self.upperBounds = self.env.observation_space.high
        self.lowerBounds = self.env.observation_space.low
        self.upperBounds[1] = cart_velocity_max
        self.upperBounds[3] = pole_angle_velocity_max
        self.lowerBounds[1] = cart_velocity_min
        self.lowerBounds[3] = pole_angle_velocity_min

        self.batch_size = int(config['Parameters']['batch_size'])

        self.rewardsEpisode = 0
        self.sumRewardsEpisode = []

        # Number of bins per state dimension
        self.num_bins = [number_of_bins_position, number_of_bins_velocity, number_of_bins_angle,
                         number_of_bins_angle_velocity]

        self.replayBuffer = []
        self.Q = np.random.uniform(0, 1, size=(self.num_bins[0], self.num_bins[1], self.num_bins[2],
                                               self.num_bins[3], self.action_number))

    # The observation space is not discrete, so we make it discrete
    def returnIndexState(self, state):
        position = state[0]
        velocity = state[1]
        angle = state[2]
        angularVelocity = state[3]

        cartPositionBin = np.linspace(self.lowerBounds[0], self.upperBounds[0], self.num_bins[0])
        cartVelocityBin = np.linspace(self.lowerBounds[1], self.upperBounds[1], self.num_bins[1])
        cartAngleBin = np.linspace(self.lowerBounds[2], self.upperBounds[2], self.num_bins[2])
        cartAngularVelocityBin = np.linspace(self.lowerBounds[3], self.upperBounds[3], self.num_bins[3])

        indexPosition = np.maximum(np.digitize(position, cartPositionBin) - 1, 0)
        indexVelocity = np.maximum(np.digitize(velocity, cartVelocityBin) - 1, 0)
        indexAngle = np.maximum(np.digitize(angle, cartAngleBin) - 1, 0)
        indexAngularVelocity = np.maximum(np.digitize(angularVelocity, cartAngularVelocityBin) - 1, 0)

        return tuple([indexPosition, indexVelocity, indexAngle, indexAngularVelocity])

    def selectAction(self, state, index):
        # The first 10% of the episodes are fully random
        if index < self.numEpisodes * 0.1:
            return np.random.choice(self.action_number)

        # We generate a random number to decide if we are exploring or not.
        randomNumber = np.random.random()

        # Epsilon starts to decay after 60% of the episodes
        if index > self.numEpisodes * 0.6:
            self.epsilon = 0.999 * self.epsilon

        # If satisfied we are exploring
        if randomNumber < self.epsilon:
            return np.random.choice(self.action_number)

        # Else we are being greedy (ties between equal Q-values are broken at random)
        else:
            return np.random.choice(np.where(
                self.Q[self.returnIndexState(state)] == np.max(self.Q[self.returnIndexState(state)]))[0])

    def train(self):
        for indexEpisode in tqdm(range(self.numEpisodes)):
            rewardsEpisode = []
            (stateS, _) = self.env.reset()
            stateS = list(stateS)
            terminalState = False
            steps = 0
            # Add a step limiter to shorten training time
            while not terminalState and steps < 2000:
                steps += 1
                stateSIndex = self.returnIndexState(stateS)
                actionA = self.selectAction(stateS, indexEpisode)

                (stateSprime, reward, terminalState, _, _) = self.env.step(actionA)
                rewardsEpisode.append(reward)
                stateSprime = list(stateSprime)

                # Store the experience in the buffer
                self.replayBuffer.append([stateS, actionA, reward, stateSprime, terminalState])

                stateSprimeIndex = self.returnIndexState(stateSprime)

                # One-step Q-learning update
                QmaxPrime = np.max(self.Q[stateSprimeIndex])
                if not terminalState:
                    error = reward + self.gamma * QmaxPrime - self.Q[stateSIndex + (actionA,)]
                    self.Q[stateSIndex + (actionA,)] = self.Q[stateSIndex + (actionA,)] + self.alpha * error
                else:
                    error = reward - self.Q[stateSIndex + (actionA,)]
                    self.Q[stateSIndex + (actionA,)] = self.Q[stateSIndex + (actionA,)] + self.alpha * error

                stateS = stateSprime

            # Replay a batch of stored experiences every 5 episodes
            if indexEpisode % 5 == 0:
                self.updateQValues()
            self.sumRewardsEpisode.append(np.sum(rewardsEpisode))

    def updateQValues(self):
        if len(self.replayBuffer) < self.batch_size:
            return

        # Select a random batch of experiences
        batch = random.sample(self.replayBuffer, self.batch_size)

        for experience in batch:
            state, action, reward, next_state, done = experience
            stateIndex = self.returnIndexState(state)
            actionIndex = action

            if not done:
                next_stateIndex = self.returnIndexState(next_state)
                QmaxPrime = np.max(self.Q[next_stateIndex])
                error = reward + self.gamma * QmaxPrime - self.Q[stateIndex + (actionIndex,)]
            else:
                error = reward - self.Q[stateIndex + (actionIndex,)]
            self.Q[stateIndex + (actionIndex,)] += self.alpha * error

    def simulateLearnedStrategy(self, env1=gym.make("CartPole-v1"), render=False):
        # Choose this line if you want to see how it behaves
        # env1 = gym.make("CartPole-v1", render_mode='human')
        (currentState, _) = env1.reset()
        if render:
            env1.render()
        timeSteps = 3000
        steps = 0
        # obtained rewards at every time step
        obtainedRewards = []
        terminated = False
        truncated = False
        while not (terminated or truncated) and steps < timeSteps:
            steps += 1
            # select greedy actions
            actionInStateS = np.random.choice(np.where(self.Q[self.returnIndexState(currentState)] == np.max(
                self.Q[self.returnIndexState(currentState)]))[0])
            currentState, reward, terminated, truncated, info = env1.step(actionInStateS)
            obtainedRewards.append(reward)
            time.sleep(0.05)
            if terminated:
                time.sleep(1)
                break
        return obtainedRewards, env1

    def simulateRandomStrategy(self):
        env2 = gym.make('CartPole-v1')
        (currentState, _) = env2.reset()
        # env2.render()
        # number of simulation episodes
        episodeNumber = 100
        # time steps in every episode
        timeSteps = 1000
        # sum of rewards in each episode
        rewardsEpisode = []

        for timeIndex in range(timeSteps):
            random_action = env2.action_space.sample()
            observation, reward, terminated, truncated, info = env2.step(random_action)
            rewardsEpisode.append(reward)
            if terminated:
                break

        return np.sum(rewardsEpisode), env2
config.ini
ADDED
@@ -0,0 +1,14 @@
[Parameters]
cart_velocity_min = -5
cart_velocity_max = 5
pole_angle_velocity_min = -10
pole_angle_velocity_max = 10
number_of_bins_position = 50
number_of_bins_velocity = 50
number_of_bins_angle = 50
number_of_bins_angle_velocity = 50
alpha = 0.15
gamma = 1
epsilon = 0.25
number_episodes = 20000
batch_size = 32
convergence.png
ADDED
convergence15kBuffer(1klimiter).png
ADDED
convergence20000.png
ADDED
histogram.png
ADDED
main.py
ADDED
@@ -0,0 +1,55 @@
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

from Qlearning_pole import Qlearning

# Rendering the environment
# env = gym.make('CartPole-v1', render_mode='human')


Q1 = Qlearning()
# run the Q-Learning algorithm
Q1.train()
# simulate the learned strategy
(obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()

# close the environment
env1.close()
# get the sum of rewards
print("Sum of rewards (learned strategy):", np.sum(obtainedRewardsOptimal))

# save the learned Q matrix and plot the convergence of the episode rewards
np.save("Qmatrix_new.npy", Q1.Q)
plt.figure(figsize=(12, 5))
plt.plot(Q1.sumRewardsEpisode, color='blue', linewidth=1)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.yscale('log')
plt.title("Convergence of rewards")
plt.savefig('convergence.png')
plt.show()

# now simulate a random strategy and plot a histogram of its episode rewards
obtainedRewardsRandom = []
for i in range(50):
    (rewardsRandom, env2) = Q1.simulateRandomStrategy()
    obtainedRewardsRandom.append(rewardsRandom)
plt.title("Rewards with random strategy")
plt.hist(obtainedRewardsRandom)
plt.xlabel('Sum of rewards')
plt.ylabel('Count')
plt.savefig('histogram.png')
plt.show()

# run the learned strategy again to compare with the random strategy
(obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()
readme.md
ADDED
@@ -0,0 +1,63 @@
# Cartpole Reinforcement Learning

This repository is a project focused on exploring reinforcement learning techniques using the OpenAI Gym environment. The objective is to compare different algorithms and approaches to improve the performance of an agent in the Cartpole task.

## Installation
Install the required packages:
```
pip install -r requirements.txt
```

If you want to execute the training phase and get your own model, run the main program; the hyperparameters and other options can be changed via the config.ini file.

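In essence, the main program (main.py in this commit) reduces to constructing a `Qlearning` agent, which reads its hyperparameters from config.ini, training it, and saving the learned Q table; a minimal sketch of that flow:

```
from Qlearning_pole import Qlearning
import numpy as np

agent = Qlearning()                  # hyperparameters are loaded from config.ini
agent.train()                        # runs number_episodes episodes of tabular Q-learning
np.save("Qmatrix_new.npy", agent.Q)  # main.py stores the learned Q table like this
```
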
If you just want to watch the trained model play the game, execute the following:

```
python3 watchModel.py
```

## Objectives

The main objectives of this project are as follows:

1. Develop a working model that demonstrates an increase in survival time through training.
2. Experiment with different reinforcement learning algorithms and compare their training time, complexity, and achieved scores.
3. Fine-tune the algorithm parameters and the number of bins used to achieve optimal training results.
4. Improve the consistency of the trained agent's strategy.
5. Implement experience replay to enhance learning.

## Results

The initial approach used in this project was Q-Learning, and it produced the following results:

![convergence_old](resources/convergence_old.png)

The convergence plot shows an increase in the score over time, with three distinct phases. The first phase corresponds to purely random actions, followed by a phase in which the model still explores heavily. The third phase begins once the epsilon value starts to decay; the schedule is sketched below.

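These phases come straight from the action-selection rule in Qlearning_pole.py. The sketch below condenses that rule into a standalone helper (the function name and signature are illustrative, not part of the repository):

```
import numpy as np

def select_action(Q, state, episode, epsilon, number_episodes, n_actions=2):
    """Condensed sketch of Qlearning.selectAction (CartPole has 2 actions)."""
    # Phase 1: the first 10% of episodes act completely at random.
    if episode < 0.1 * number_episodes:
        return np.random.choice(n_actions), epsilon
    # Phase 3: after 60% of the episodes, epsilon decays by 0.1% per episode.
    if episode > 0.6 * number_episodes:
        epsilon *= 0.999
    # Phase 2: epsilon-greedy choice between exploring and exploiting the Q table.
    if np.random.random() < epsilon:
        return np.random.choice(n_actions), epsilon
    # The repository version breaks ties between equal Q-values at random;
    # plain argmax is used here for brevity.
    return int(np.argmax(Q[state])), epsilon
```
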
![histogram_old](resources/histogram_old.png)

Comparing the results of the trained agent (after 20,000 episodes) with a random agent clearly demonstrates the improvement achieved:

![boxplot](resources/boxplot.png)

Despite the improvements, the trained agent still lacks consistency. This inconsistency is believed to be due to the inherent randomness in the Cartpole environment.

## Experience Replay

Experience replay has been implemented in this project, leading to significant improvements in the agent's performance. The details and results of this implementation are presented below.

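The replay step itself is small: every few episodes a random batch of stored transitions is drawn and the usual Q-learning update is re-applied to it. Below is a self-contained sketch mirroring the committed updateQValues method (the function name and the to_index argument are illustrative stand-ins for returnIndexState):

```
import random
import numpy as np

def replay_update(Q, buffer, batch_size, alpha, gamma, to_index):
    """Re-apply the Q-learning update to a random batch of stored transitions."""
    if len(buffer) < batch_size:
        return
    for state, action, reward, next_state, done in random.sample(buffer, batch_size):
        s = to_index(state)  # discretized state index, as in returnIndexState
        target = reward if done else reward + gamma * np.max(Q[to_index(next_state)])
        Q[s + (action,)] += alpha * (target - Q[s + (action,)])
```

In the committed code this runs every 5 training episodes, with batch_size taken from config.ini (32 by default).
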
The results of the trained agent with experience replay are as follows:
It should be mentioned that, to speed up the training phase, the experience replay agent had a score limit of 2000 per episode.

| Metric | Old Agent | Trained Agent with Experience Replay |
|------------------------|--------------|--------------------------------------|
| Convergence Plot | ![convergence_old](resources/convergence_old.png) | ![convergence20kbuffer](resources/convergence20kbuffer.png) |
| Score Histogram | ![histogram_old](resources/histogram_old.png) | ![old_agent20k](resources/old_agent20k.png) |
| Boxplot | ![boxplot_old_theta](resources/boxplot_old_theta.png) | ![boxplot15k_bufferMemory](resources/boxplot15k_bufferMemory.png) |

As observed, adding experience replay has allowed the agent to objectively increase its score.

## References

- https://arxiv.org/pdf/2112.04213.pdf
- https://aleksandarhaber.com/q-learning-in-python-with-tests-in-cart-pole-openai-gym-environment-reinforcement-learning-tutorial/
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gym==0.26.2
gymnasium==0.28.1
matplotlib==3.7.1
numba==0.57.0
numpy==1.23.5
tqdm==4.65.0
resources/boxplot.png
ADDED
resources/boxplot15k_bufferMemory.png
ADDED
resources/boxplot_5k_bufferMemory.png
ADDED
resources/boxplot_old.png
ADDED
resources/boxplot_old_theta.png
ADDED
resources/convergence20kbuffer.png
ADDED
resources/convergence_old.png
ADDED
resources/histogram_old.png
ADDED
resources/old_agent20k.png
ADDED
resources/old_boxplot.png
ADDED
resources/trained_agent20k.png
ADDED
test_weights.py
ADDED
@@ -0,0 +1,40 @@
import numpy as np
from Qlearning_pole import Qlearning
from tqdm import tqdm
import matplotlib.pyplot as plt


# Number of games the agent will play.
iterations = 50
agent = Qlearning()
# Insert the weights of the agent to plot.
agent.Q = np.load("Qmatrix.npy")

scores = []
for i in tqdm(range(iterations), miniters=1, desc="Trained Agent"):
    a, b = agent.simulateLearnedStrategy()
    scores.append(np.sum(a))

random_scores = []
for i in tqdm(range(iterations), miniters=1, desc="Random Agent"):
    a, b = agent.simulateRandomStrategy()
    random_scores.append(a)

data = [random_scores, scores]
print(data)

plt.title("Rewards with trained agent")
plt.hist(scores)
plt.xlabel('Reward')
plt.ylabel('Count')
# plt.savefig('./resources/new.png')
plt.show()

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)

bp = ax.boxplot(data, patch_artist=True, notch=True, vert=0)
plt.title("Trained Agent vs Random Agent (50 episodes)")
# plt.savefig("./resources/old_boxplot.png")
plt.show()
time.py
ADDED
@@ -0,0 +1,32 @@
# note: jit and cuda are imported here but not used below
from numba import jit, cuda
import numpy as np
# to measure exec time
from tqdm import tqdm
from timeit import default_timer as timer


# plain CPU loop that reports progress with print()
def func(a):
    for i in range(100000000):
        a[i] += 1
        if i % 100 == 0:
            print(i)


# same loop, but reporting progress with tqdm instead of print
def func2(a):
    for i in tqdm(range(100000000)):
        a[i] += 1


if __name__ == "__main__":
    n = 100000000
    a = np.ones(n, dtype=np.float64)
    """
    start = timer()
    func(a)
    print("Prints", timer() - start)
    """
    start = timer()
    func2(a)
    print("tqdm", timer() - start)
watchModel.py
ADDED
@@ -0,0 +1,8 @@
import gymnasium as gym
from Qlearning_pole import Qlearning
import numpy as np

if __name__ == '__main__':
    env = gym.make('CartPole-v1', render_mode='human')
    q = Qlearning(env)
    q.Q = np.load('Qmatrix.npy')
    q.simulateLearnedStrategy(render=True, env1=env)