CarlosMN committed
Commit f1acf35
1 Parent(s): 796f73d

Initial Commit
.gitignore ADDED
@@ -0,0 +1,163 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+ /shelf/
+ /workspace.xml
+ *.npy
Qlearning_pole.py ADDED
@@ -0,0 +1,222 @@
+ import random
+ import time
+ import configparser
+
+ import numpy as np
+ import gym
+ from tqdm import tqdm
+
+
+ class Qlearning:
+     ###########################################################################
+     # Q-learning agent for the Cart Pole environment.
+     #
+     # __init__ INPUTS:
+     # env  - Cart Pole environment (a default CartPole-v1 is created if omitted)
+     # file - path to the configuration file (config.ini), which provides:
+     #   alpha            - step size
+     #   gamma            - discount rate
+     #   epsilon          - parameter for the epsilon-greedy approach
+     #   number_episodes  - total number of simulation episodes
+     #   number_of_bins_* - number of grid points used to discretize each state
+     #                      entry: cart position, cart velocity, pole angle,
+     #                      and pole angular velocity
+     #   cart_velocity_min/max, pole_angle_velocity_min/max - lower and upper
+     #                      bounds (limits) used to discretize the two state
+     #                      entries that are unbounded in the observation space
+     ###########################################################################
+     def __init__(self, env=None, file='config.ini'):
+         # Create the environment lazily instead of in a default argument,
+         # which would be evaluated once at import time.
+         self.env = env if env is not None else gym.make('CartPole-v1')
+         self.load_values(file)
+
+     def load_values(self, file):
+         config = configparser.ConfigParser()
+         config.read(file)
+
+         cart_velocity_min = float(config['Parameters']['cart_velocity_min'])
+         cart_velocity_max = float(config['Parameters']['cart_velocity_max'])
+         pole_angle_velocity_min = float(config['Parameters']['pole_angle_velocity_min'])
+         pole_angle_velocity_max = float(config['Parameters']['pole_angle_velocity_max'])
+         number_of_bins_position = int(config['Parameters']['number_of_bins_position'])
+         number_of_bins_velocity = int(config['Parameters']['number_of_bins_velocity'])
+         number_of_bins_angle = int(config['Parameters']['number_of_bins_angle'])
+         number_of_bins_angle_velocity = int(config['Parameters']['number_of_bins_angle_velocity'])
+         self.action_number = self.env.action_space.n
+         self.alpha = float(config['Parameters']['alpha'])
+         self.gamma = float(config['Parameters']['gamma'])
+         self.epsilon = float(config['Parameters']['epsilon'])
+         self.numEpisodes = int(config['Parameters']['number_episodes'])
+
+         # Cart velocity and pole angular velocity are unbounded in the
+         # observation space, so their limits are taken from the config file.
+         self.upperBounds = self.env.observation_space.high
+         self.lowerBounds = self.env.observation_space.low
+         self.upperBounds[1] = cart_velocity_max
+         self.upperBounds[3] = pole_angle_velocity_max
+         self.lowerBounds[1] = cart_velocity_min
+         self.lowerBounds[3] = pole_angle_velocity_min
+
+         self.batch_size = int(config['Parameters']['batch_size'])
+
+         self.rewardsEpisode = 0
+         self.sumRewardsEpisode = []
+
+         # Number of bins per state entry
+         self.num_bins = [number_of_bins_position, number_of_bins_velocity,
+                          number_of_bins_angle, number_of_bins_angle_velocity]
+
+         self.replayBuffer = []
+         self.Q = np.random.uniform(0, 1, size=(self.num_bins[0], self.num_bins[1],
+                                                self.num_bins[2], self.num_bins[3],
+                                                self.action_number))
+
+     # The observation space is continuous, so map a state to discrete bin indices.
+     def returnIndexState(self, state):
+         position = state[0]
+         velocity = state[1]
+         angle = state[2]
+         angularVelocity = state[3]
+
+         cartPositionBin = np.linspace(self.lowerBounds[0], self.upperBounds[0], self.num_bins[0])
+         cartVelocityBin = np.linspace(self.lowerBounds[1], self.upperBounds[1], self.num_bins[1])
+         cartAngleBin = np.linspace(self.lowerBounds[2], self.upperBounds[2], self.num_bins[2])
+         cartAngularVelocityBin = np.linspace(self.lowerBounds[3], self.upperBounds[3], self.num_bins[3])
+
+         indexPosition = np.maximum(np.digitize(position, cartPositionBin) - 1, 0)
+         indexVelocity = np.maximum(np.digitize(velocity, cartVelocityBin) - 1, 0)
+         indexAngle = np.maximum(np.digitize(angle, cartAngleBin) - 1, 0)
+         indexAngularVelocity = np.maximum(np.digitize(angularVelocity, cartAngularVelocityBin) - 1, 0)
+
+         return (indexPosition, indexVelocity, indexAngle, indexAngularVelocity)
+
+     def selectAction(self, state, index):
+         # The first 10% of the episodes are completely random.
+         if index < self.numEpisodes * 0.1:
+             return np.random.choice(self.action_number)
+
+         # Generate a random number to decide whether to explore.
+         randomNumber = np.random.random()
+
+         # Epsilon decay starts after 60% of the episodes.
+         if index > self.numEpisodes * 0.6:
+             self.epsilon = 0.999 * self.epsilon
+
+         # If satisfied, we are exploring.
+         if randomNumber < self.epsilon:
+             return np.random.choice(self.action_number)
+
+         # Otherwise we act greedily (ties are broken at random).
+         else:
+             return np.random.choice(np.where(
+                 self.Q[self.returnIndexState(state)] == np.max(self.Q[self.returnIndexState(state)]))[0])
+
+     def train(self):
+         for indexEpisode in tqdm(range(self.numEpisodes)):
+             rewardsEpisode = []
+             (stateS, _) = self.env.reset()
+             stateS = list(stateS)
+             terminalState = False
+             steps = 0
+             # Limit the number of steps per episode to shorten training time.
+             while not terminalState and steps < 2000:
+                 steps += 1
+                 stateSIndex = self.returnIndexState(stateS)
+                 actionA = self.selectAction(stateS, indexEpisode)
+
+                 (stateSprime, reward, terminalState, _, _) = self.env.step(actionA)
+                 rewardsEpisode.append(reward)
+                 stateSprime = list(stateSprime)
+
+                 # Store the experience in the replay buffer.
+                 self.replayBuffer.append([stateS, actionA, reward, stateSprime, terminalState])
+
+                 stateSprimeIndex = self.returnIndexState(stateSprime)
+
+                 # Temporal-difference update of the Q table.
+                 QmaxPrime = np.max(self.Q[stateSprimeIndex])
+                 if not terminalState:
+                     error = reward + self.gamma * QmaxPrime - self.Q[stateSIndex + (actionA,)]
+                 else:
+                     error = reward - self.Q[stateSIndex + (actionA,)]
+                 self.Q[stateSIndex + (actionA,)] += self.alpha * error
+
+                 stateS = stateSprime
+
+             # Replay a batch of stored experiences every 5 episodes.
+             if indexEpisode % 5 == 0:
+                 self.updateQValues()
+             self.sumRewardsEpisode.append(np.sum(rewardsEpisode))
+
+     def updateQValues(self):
+         if len(self.replayBuffer) < self.batch_size:
+             return
+
+         # Select a random batch of experiences.
+         batch = random.sample(self.replayBuffer, self.batch_size)
+
+         for experience in batch:
+             state, action, reward, next_state, done = experience
+             stateIndex = self.returnIndexState(state)
+             actionIndex = action
+
+             if not done:
+                 next_stateIndex = self.returnIndexState(next_state)
+                 QmaxPrime = np.max(self.Q[next_stateIndex])
+                 error = reward + self.gamma * QmaxPrime - self.Q[stateIndex + (actionIndex,)]
+             else:
+                 error = reward - self.Q[stateIndex + (actionIndex,)]
+             self.Q[stateIndex + (actionIndex,)] += self.alpha * error
+
+     def simulateLearnedStrategy(self, env1=None, render=False):
+         # Pass an environment created with render_mode='human' if you want to
+         # watch how the agent behaves (see watchModel.py).
+         if env1 is None:
+             env1 = gym.make("CartPole-v1")
+         (currentState, _) = env1.reset()
+         if render:
+             env1.render()
+         timeSteps = 3000
+         steps = 0
+         # rewards obtained at every time step
+         obtainedRewards = []
+         terminated = False
+         truncated = False
+         while not (terminated or truncated) and steps < timeSteps:
+             steps += 1
+             # select greedy actions
+             actionInStateS = np.random.choice(np.where(self.Q[self.returnIndexState(currentState)] == np.max(
+                 self.Q[self.returnIndexState(currentState)]))[0])
+             currentState, reward, terminated, truncated, info = env1.step(actionInStateS)
+             obtainedRewards.append(reward)
+             time.sleep(0.05)
+             if terminated:
+                 time.sleep(1)
+                 break
+         return obtainedRewards, env1
+
+     def simulateRandomStrategy(self):
+         env2 = gym.make('CartPole-v1')
+         (currentState, _) = env2.reset()
+         # time steps in the episode
+         timeSteps = 1000
+         # rewards collected at every time step
+         rewardsEpisode = []
+
+         for timeIndex in range(timeSteps):
+             random_action = env2.action_space.sample()
+             observation, reward, terminated, truncated, info = env2.step(random_action)
+             rewardsEpisode.append(reward)
+             if terminated:
+                 break
+
+         return np.sum(rewardsEpisode), env2
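A note on the discretization above: `returnIndexState` cuts each of the four continuous CartPole observation values into evenly spaced bins with `np.linspace` and maps them to integer indices with `np.digitize`. The standalone sketch below reproduces that step in isolation; the bounds, bin counts, and the `discretize` helper name are illustrative stand-ins for the values read from `config.ini` and from the environment's `observation_space`.

```python
import numpy as np

# Illustrative bounds and bin counts; the real ones come from config.ini and
# the environment's observation_space (see load_values above).
lower_bounds = np.array([-4.8, -5.0, -0.418, -10.0])
upper_bounds = np.array([4.8, 5.0, 0.418, 10.0])
num_bins = (50, 50, 50, 50)


def discretize(state):
    """Map a continuous 4-dimensional CartPole state to a tuple of bin indices."""
    indices = []
    for value, low, high, bins in zip(state, lower_bounds, upper_bounds, num_bins):
        edges = np.linspace(low, high, bins)               # evenly spaced bin edges
        indices.append(max(np.digitize(value, edges) - 1, 0))
    return tuple(indices)


# A state near the upright equilibrium lands close to the middle bins.
print(discretize([0.0, 0.1, 0.02, -0.3]))
```

Clamping at 0 keeps values below the lower bound in the first bin, and since `np.digitize` returns at most the number of edges, values above the upper bound end up in the last bin, so every state maps to a valid Q-table index.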
config.ini ADDED
@@ -0,0 +1,14 @@
+ [Parameters]
+ cart_velocity_min = -5
+ cart_velocity_max = 5
+ pole_angle_velocity_min = -10
+ pole_angle_velocity_max = 10
+ number_of_bins_position = 50
+ number_of_bins_velocity = 50
+ number_of_bins_angle = 50
+ number_of_bins_angle_velocity = 50
+ alpha = 0.15
+ gamma = 1
+ epsilon = 0.25
+ number_episodes = 20000
+ batch_size = 32
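These values are consumed by `load_values` in `Qlearning_pole.py` with plain `float(...)`/`int(...)` casts. As a small optional variation (a sketch only, not the project's loader), `configparser`'s typed getters can do the parsing and fall back to the defaults listed in this file when an option is missing:

```python
import configparser

config = configparser.ConfigParser()
config.read('config.ini')

# getfloat/getint parse the option strings; the fallback is used when an option is missing.
alpha = config.getfloat('Parameters', 'alpha', fallback=0.15)
gamma = config.getfloat('Parameters', 'gamma', fallback=1.0)
epsilon = config.getfloat('Parameters', 'epsilon', fallback=0.25)
number_episodes = config.getint('Parameters', 'number_episodes', fallback=20000)
batch_size = config.getint('Parameters', 'batch_size', fallback=32)

print(alpha, gamma, epsilon, number_episodes, batch_size)
```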
convergence.png ADDED
convergence15kBuffer(1klimiter).png ADDED
convergence20000.png ADDED
histogram.png ADDED
main.py ADDED
@@ -0,0 +1,55 @@
+ import gymnasium as gym
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ from Qlearning_pole import Qlearning
+
+ # Rendering the environment
+ # env = gym.make('CartPole-v1', render_mode='human')
+
+
+ Q1 = Qlearning()
+ # run the Q-Learning algorithm
+ Q1.train()
+ # simulate the learned strategy
+ (obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()
+
+ # close the environment
+ env1.close()
+ # print the sum of rewards of the learned strategy
+ print(np.sum(obtainedRewardsOptimal))
+
+ # save the learned Q matrix
+ # (note: watchModel.py and test_weights.py load 'Qmatrix.npy', so rename
+ # this file if you want them to use the matrix from this run)
+ np.save("Qmatrix_new.npy", Q1.Q)
+
+ # plot the per-episode rewards collected during training
+ plt.figure(figsize=(12, 5))
+ plt.plot(Q1.sumRewardsEpisode, color='blue', linewidth=1)
+ plt.xlabel('Episode')
+ plt.ylabel('Reward')
+ plt.yscale('log')
+ plt.title("Convergence of rewards")
+ plt.savefig('convergence.png')
+ plt.show()
+
+
+ # now simulate a random strategy several times and plot the score distribution
+ obtainedRewardsRandom = []
+ for i in range(50):
+     (rewardsRandom, env2) = Q1.simulateRandomStrategy()
+     obtainedRewardsRandom.append(rewardsRandom)
+
+ plt.figure()
+ plt.title("Rewards with random strategy")
+ plt.hist(obtainedRewardsRandom)
+ plt.xlabel('Sum of rewards')
+ plt.ylabel('Count')
+ plt.savefig('histogram.png')
+ plt.show()
+
+ # run the learned strategy again to compare it with the random strategy
+ (obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()
+ env1.close()
readme.md ADDED
@@ -0,0 +1,63 @@
+ # Cartpole Reinforcement Learning
+
+ This repository is a project focused on exploring reinforcement learning techniques using the OpenAI Gym environment. The objective is to compare different algorithms and approaches to improve the performance of an agent in the Cartpole task.
+
+ ## Installation
+ Install the required packages:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ If you want to run the training phase and build your own model, execute the main program (`main.py`); the hyperparameters and other options can be changed via the `config.ini` file.
+
+ If you just want to watch the trained model play the game, execute the following:
+
+ ```
+ python3 watchModel.py
+ ```
+
+
+ ## Objectives
+
+ The main objectives of this project are as follows:
+
+ 1. Develop a working model that demonstrates an increase in survival time through training.
+ 2. Experiment with different reinforcement learning algorithms and compare their training time, complexity, and achieved scores.
+ 3. Fine-tune the algorithm parameters and the number of bins used to achieve optimal training results.
+ 4. Improve the consistency of the trained agent's strategy.
+ 5. Implement experience replay to enhance learning.
+
+ ## Results
+
+ The initial approach used in this project was Q-Learning, and it produced the following results:
+
+ ![Convergence Plot](./resources/convergence_old.png)
+
+ The convergence plot shows an increase in the score over time, with three distinct phases. The first phase corresponds to purely random actions (the first 10% of episodes), followed by a phase in which the model still explores heavily under the epsilon-greedy policy. The third phase begins once the epsilon value starts to decay, after 60% of the episodes.
+
+ ![Score Histogram](./resources/histogram_old.png)
+
+ Comparing the results of the trained agent (after 20,000 episodes) with a random agent clearly demonstrates the improvement achieved:
+
+ ![Score Boxplot](./resources/boxplot_old.png)
+
+ Despite the improvements, the trained agent still lacks consistency. This inconsistency is believed to be due to the inherent randomness of the Cartpole environment.
+
+ ## Experience Replay
+
+ Experience replay has been implemented in this project, leading to significant improvements in the agent's performance. The details and results of this implementation are presented below.
+
+ The results of the trained agent with experience replay are as follows. It should be mentioned that, to speed up the training phase, the experience-replay agent had a score limit of 2000 per episode.
+
+ | Metric | Old Agent | Trained Agent with Experience Replay |
+ |--------|-----------|--------------------------------------|
+ | Convergence Plot | ![Convergence Plot](./resources/convergence_old.png) | ![Convergence Plot](./resources/convergence20kbuffer.png) |
+ | Score Histogram | ![Score Histogram](./resources/old_agent20k.png) | ![Score Histogram](./resources/trained_agent20k.png) |
+ | Boxplot | ![Score Boxplot](./resources/old_boxplot.png) | ![Score Boxplot](./resources/boxplot.png) |
+
+ As observed, adding experience replay has allowed the agent to measurably increase its score.
+
+ ## References
+
+ - https://arxiv.org/pdf/2112.04213.pdf
+ - https://aleksandarhaber.com/q-learning-in-python-with-tests-in-cart-pole-openai-gym-environment-reinforcement-learning-tutorial/
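To make the three phases described in the Results section of the readme concrete, here is a small, hypothetical sketch of the exploration schedule that `selectAction` in `Qlearning_pole.py` follows: purely random actions for the first 10% of episodes, a constant epsilon-greedy phase, and a multiplicative 0.999 decay once 60% of the episodes have passed. The `exploration_epsilon` helper is invented for illustration; in the actual code the decay factor is applied on every action selection after that point rather than once per episode, so the real epsilon shrinks faster than this per-episode idealization.

```python
def exploration_epsilon(episode, num_episodes=20_000, epsilon=0.25, decay=0.999):
    """Illustrative three-phase epsilon schedule (per-episode idealization)."""
    if episode < 0.1 * num_episodes:
        return 1.0                                  # phase 1: fully random actions
    if episode > 0.6 * num_episodes:
        n_decays = episode - int(0.6 * num_episodes)
        return epsilon * decay ** n_decays          # phase 3: geometric decay
    return epsilon                                  # phase 2: constant epsilon-greedy


# Rough shape of the schedule for the 20,000 episodes set in config.ini
for episode in (0, 1_000, 5_000, 12_001, 16_000, 19_999):
    print(episode, round(exploration_epsilon(episode), 5))
```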
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gym==0.26.2
+ gymnasium==0.28.1
+ matplotlib==3.7.1
+ numba==0.57.0
+ numpy==1.23.5
+ tqdm==4.65.0
resources/boxplot.png ADDED
resources/boxplot15k_bufferMemory.png ADDED
resources/boxplot_5k_bufferMemory.png ADDED
resources/boxplot_old.png ADDED
resources/boxplot_old_theta.png ADDED
resources/convergence20kbuffer.png ADDED
resources/convergence_old.png ADDED
resources/histogram_old.png ADDED
resources/old_agent20k.png ADDED
resources/old_boxplot.png ADDED
resources/trained_agent20k.png ADDED
test_weights.py ADDED
@@ -0,0 +1,40 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+
+ from Qlearning_pole import Qlearning
+
+
+ # Number of games the agent will play.
+ iterations = 50
+ agent = Qlearning()
+ # Insert the weights of the agent to plot.
+ agent.Q = np.load("Qmatrix.npy")
+
+ scores = []
+ for i in tqdm(range(iterations), miniters=1, desc="Trained Agent"):
+     a, b = agent.simulateLearnedStrategy()
+     scores.append(np.sum(a))
+
+ random_scores = []
+ for i in tqdm(range(iterations), miniters=1, desc="Random Agent"):
+     a, b = agent.simulateRandomStrategy()
+     random_scores.append(a)
+
+ data = [random_scores, scores]
+ print(data)
+
+ plt.title("Rewards with trained agent")
+ plt.hist(scores)
+ plt.xlabel('Reward')
+ plt.ylabel('Count')
+ # plt.savefig('./resources/new.png')
+ plt.show()
+
+ fig = plt.figure(figsize=(10, 7))
+ ax = fig.add_subplot(111)
+
+ bp = ax.boxplot(data, patch_artist=True, notch=True, vert=0)
+ plt.title("Trained Agent vs Random Agent (50 episodes)")
+ # plt.savefig("./resources/old_boxplot.png")
+ plt.show()
time.py ADDED
@@ -0,0 +1,32 @@
+ import numpy as np
+ from tqdm import tqdm
+ # to measure exec time
+ from timeit import default_timer as timer
+
+
+ # plain loop, printing its progress manually every 100 iterations
+ def func(a):
+     for i in range(100000000):
+         a[i] += 1
+         if i % 100 == 0:
+             print(i)
+
+
+ # same loop, with tqdm providing the progress bar
+ def func2(a):
+     for i in tqdm(range(100000000)):
+         a[i] += 1
+
+
+ if __name__ == "__main__":
+     n = 100000000
+     a = np.ones(n, dtype=np.float64)
+     """
+     start = timer()
+     func(a)
+     print("Prints", timer() - start)
+     """
+     start = timer()
+     func2(a)
+     print("tqdm", timer() - start)
watchModel.py ADDED
@@ -0,0 +1,8 @@
+ import gymnasium as gym
+ import numpy as np
+
+ from Qlearning_pole import Qlearning
+
+ if __name__ == '__main__':
+     env = gym.make('CartPole-v1', render_mode='human')
+     q = Qlearning(env)
+     q.Q = np.load('Qmatrix.npy')
+     q.simulateLearnedStrategy(render=True, env1=env)