Spaces:
Running
Running
Commit
·
c161b3b
1
Parent(s):
d217164
feat: data-pipelines
Browse files
implemented data loading pipelines, data cleaning and basic EDA along with TFIDF vectorization
- data/MMR_DATA.csv +0 -0
- data/__init__.py +0 -0
- notebooks/EDA.ipynb +658 -0
- notebooks/TFIDF.ipynb +0 -0
- notebooks/__init__.py +0 -0
- notebooks/data_loading.ipynb +996 -0
- src/main.py +33 -0
- utilities/__init__.py +2 -0
- utilities/__pycache__/__init__.cpython-311.pyc +0 -0
- utilities/__pycache__/__init__.cpython-312.pyc +0 -0
- utilities/__pycache__/data_loader.cpython-311.pyc +0 -0
- utilities/__pycache__/data_loader.cpython-312.pyc +0 -0
- utilities/data_cleaner.py +27 -0
- utilities/data_loader.py +222 -0
- utils/__init__.py +0 -0
data/MMR_DATA.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/__init__.py
DELETED
File without changes
|
notebooks/EDA.ipynb
ADDED
@@ -0,0 +1,658 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## __Exploratory Data Analysis__"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 53,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"## importing libraries\n",
|
17 |
+
"\n",
|
18 |
+
"import numpy as numpy\n",
|
19 |
+
"import pandas as pd\n",
|
20 |
+
"import matplotlib.pyplot as plt\n",
|
21 |
+
"import seaborn as sns\n",
|
22 |
+
"\n",
|
23 |
+
"import sys\n",
|
24 |
+
"import os\n",
|
25 |
+
"\n",
|
26 |
+
"import re\n",
|
27 |
+
"import nltk\n",
|
28 |
+
"from nltk.corpus import stopwords\n",
|
29 |
+
"from nltk.stem import PorterStemmer\n",
|
30 |
+
"from nltk.stem import WordNetLemmatizer"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": 6,
|
36 |
+
"metadata": {},
|
37 |
+
"outputs": [
|
38 |
+
{
|
39 |
+
"data": {
|
40 |
+
"text/html": [
|
41 |
+
"<div>\n",
|
42 |
+
"<style scoped>\n",
|
43 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
44 |
+
" vertical-align: middle;\n",
|
45 |
+
" }\n",
|
46 |
+
"\n",
|
47 |
+
" .dataframe tbody tr th {\n",
|
48 |
+
" vertical-align: top;\n",
|
49 |
+
" }\n",
|
50 |
+
"\n",
|
51 |
+
" .dataframe thead th {\n",
|
52 |
+
" text-align: right;\n",
|
53 |
+
" }\n",
|
54 |
+
"</style>\n",
|
55 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
56 |
+
" <thead>\n",
|
57 |
+
" <tr style=\"text-align: right;\">\n",
|
58 |
+
" <th></th>\n",
|
59 |
+
" <th>row</th>\n",
|
60 |
+
" <th>col</th>\n",
|
61 |
+
" <th>latitude</th>\n",
|
62 |
+
" <th>longitude</th>\n",
|
63 |
+
" <th>Map Data</th>\n",
|
64 |
+
" </tr>\n",
|
65 |
+
" </thead>\n",
|
66 |
+
" <tbody>\n",
|
67 |
+
" <tr>\n",
|
68 |
+
" <th>0</th>\n",
|
69 |
+
" <td>0</td>\n",
|
70 |
+
" <td>0</td>\n",
|
71 |
+
" <td>18.89433</td>\n",
|
72 |
+
" <td>72.784597</td>\n",
|
73 |
+
" <td>NaN</td>\n",
|
74 |
+
" </tr>\n",
|
75 |
+
" <tr>\n",
|
76 |
+
" <th>1</th>\n",
|
77 |
+
" <td>0</td>\n",
|
78 |
+
" <td>1</td>\n",
|
79 |
+
" <td>18.89433</td>\n",
|
80 |
+
" <td>72.794102</td>\n",
|
81 |
+
" <td>Prongs Reef is a Natural;</td>\n",
|
82 |
+
" </tr>\n",
|
83 |
+
" <tr>\n",
|
84 |
+
" <th>2</th>\n",
|
85 |
+
" <td>0</td>\n",
|
86 |
+
" <td>2</td>\n",
|
87 |
+
" <td>18.89433</td>\n",
|
88 |
+
" <td>72.803607</td>\n",
|
89 |
+
" <td>United Services Club Golf Course is a Leisure ...</td>\n",
|
90 |
+
" </tr>\n",
|
91 |
+
" <tr>\n",
|
92 |
+
" <th>3</th>\n",
|
93 |
+
" <td>0</td>\n",
|
94 |
+
" <td>3</td>\n",
|
95 |
+
" <td>18.89433</td>\n",
|
96 |
+
" <td>72.813112</td>\n",
|
97 |
+
" <td>Indian Meterological Department is a Commercia...</td>\n",
|
98 |
+
" </tr>\n",
|
99 |
+
" <tr>\n",
|
100 |
+
" <th>4</th>\n",
|
101 |
+
" <td>0</td>\n",
|
102 |
+
" <td>4</td>\n",
|
103 |
+
" <td>18.89433</td>\n",
|
104 |
+
" <td>72.822617</td>\n",
|
105 |
+
" <td>NaN</td>\n",
|
106 |
+
" </tr>\n",
|
107 |
+
" </tbody>\n",
|
108 |
+
"</table>\n",
|
109 |
+
"</div>"
|
110 |
+
],
|
111 |
+
"text/plain": [
|
112 |
+
" row col latitude longitude \\\n",
|
113 |
+
"0 0 0 18.89433 72.784597 \n",
|
114 |
+
"1 0 1 18.89433 72.794102 \n",
|
115 |
+
"2 0 2 18.89433 72.803607 \n",
|
116 |
+
"3 0 3 18.89433 72.813112 \n",
|
117 |
+
"4 0 4 18.89433 72.822617 \n",
|
118 |
+
"\n",
|
119 |
+
" Map Data \n",
|
120 |
+
"0 NaN \n",
|
121 |
+
"1 Prongs Reef is a Natural; \n",
|
122 |
+
"2 United Services Club Golf Course is a Leisure ... \n",
|
123 |
+
"3 Indian Meterological Department is a Commercia... \n",
|
124 |
+
"4 NaN "
|
125 |
+
]
|
126 |
+
},
|
127 |
+
"execution_count": 6,
|
128 |
+
"metadata": {},
|
129 |
+
"output_type": "execute_result"
|
130 |
+
}
|
131 |
+
],
|
132 |
+
"source": [
|
133 |
+
"data_folder = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
|
134 |
+
"data_file = os.path.join(data_folder, 'MMR_DATA.csv')\n",
|
135 |
+
"df = pd.read_csv(data_file)\n",
|
136 |
+
"df.head()"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"cell_type": "code",
|
141 |
+
"execution_count": 7,
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [
|
144 |
+
{
|
145 |
+
"data": {
|
146 |
+
"text/html": [
|
147 |
+
"<div>\n",
|
148 |
+
"<style scoped>\n",
|
149 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
150 |
+
" vertical-align: middle;\n",
|
151 |
+
" }\n",
|
152 |
+
"\n",
|
153 |
+
" .dataframe tbody tr th {\n",
|
154 |
+
" vertical-align: top;\n",
|
155 |
+
" }\n",
|
156 |
+
"\n",
|
157 |
+
" .dataframe thead th {\n",
|
158 |
+
" text-align: right;\n",
|
159 |
+
" }\n",
|
160 |
+
"</style>\n",
|
161 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
162 |
+
" <thead>\n",
|
163 |
+
" <tr style=\"text-align: right;\">\n",
|
164 |
+
" <th></th>\n",
|
165 |
+
" <th>row</th>\n",
|
166 |
+
" <th>col</th>\n",
|
167 |
+
" <th>latitude</th>\n",
|
168 |
+
" <th>longitude</th>\n",
|
169 |
+
" <th>Map Data</th>\n",
|
170 |
+
" </tr>\n",
|
171 |
+
" </thead>\n",
|
172 |
+
" <tbody>\n",
|
173 |
+
" <tr>\n",
|
174 |
+
" <th>0</th>\n",
|
175 |
+
" <td>0</td>\n",
|
176 |
+
" <td>0</td>\n",
|
177 |
+
" <td>18.89433</td>\n",
|
178 |
+
" <td>72.784597</td>\n",
|
179 |
+
" <td></td>\n",
|
180 |
+
" </tr>\n",
|
181 |
+
" <tr>\n",
|
182 |
+
" <th>1</th>\n",
|
183 |
+
" <td>0</td>\n",
|
184 |
+
" <td>1</td>\n",
|
185 |
+
" <td>18.89433</td>\n",
|
186 |
+
" <td>72.794102</td>\n",
|
187 |
+
" <td>Prongs Reef is a Natural;</td>\n",
|
188 |
+
" </tr>\n",
|
189 |
+
" <tr>\n",
|
190 |
+
" <th>2</th>\n",
|
191 |
+
" <td>0</td>\n",
|
192 |
+
" <td>2</td>\n",
|
193 |
+
" <td>18.89433</td>\n",
|
194 |
+
" <td>72.803607</td>\n",
|
195 |
+
" <td>United Services Club Golf Course is a Leisure ...</td>\n",
|
196 |
+
" </tr>\n",
|
197 |
+
" <tr>\n",
|
198 |
+
" <th>3</th>\n",
|
199 |
+
" <td>0</td>\n",
|
200 |
+
" <td>3</td>\n",
|
201 |
+
" <td>18.89433</td>\n",
|
202 |
+
" <td>72.813112</td>\n",
|
203 |
+
" <td>Indian Meterological Department is a Commercia...</td>\n",
|
204 |
+
" </tr>\n",
|
205 |
+
" <tr>\n",
|
206 |
+
" <th>4</th>\n",
|
207 |
+
" <td>0</td>\n",
|
208 |
+
" <td>4</td>\n",
|
209 |
+
" <td>18.89433</td>\n",
|
210 |
+
" <td>72.822617</td>\n",
|
211 |
+
" <td></td>\n",
|
212 |
+
" </tr>\n",
|
213 |
+
" </tbody>\n",
|
214 |
+
"</table>\n",
|
215 |
+
"</div>"
|
216 |
+
],
|
217 |
+
"text/plain": [
|
218 |
+
" row col latitude longitude \\\n",
|
219 |
+
"0 0 0 18.89433 72.784597 \n",
|
220 |
+
"1 0 1 18.89433 72.794102 \n",
|
221 |
+
"2 0 2 18.89433 72.803607 \n",
|
222 |
+
"3 0 3 18.89433 72.813112 \n",
|
223 |
+
"4 0 4 18.89433 72.822617 \n",
|
224 |
+
"\n",
|
225 |
+
" Map Data \n",
|
226 |
+
"0 \n",
|
227 |
+
"1 Prongs Reef is a Natural; \n",
|
228 |
+
"2 United Services Club Golf Course is a Leisure ... \n",
|
229 |
+
"3 Indian Meterological Department is a Commercia... \n",
|
230 |
+
"4 "
|
231 |
+
]
|
232 |
+
},
|
233 |
+
"execution_count": 7,
|
234 |
+
"metadata": {},
|
235 |
+
"output_type": "execute_result"
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"source": [
|
239 |
+
"## filling the NaN values in the Map Data Column with empty string\n",
|
240 |
+
"\n",
|
241 |
+
"df['Map Data'] = df['Map Data'].fillna('')\n",
|
242 |
+
"df.head()"
|
243 |
+
]
|
244 |
+
},
|
245 |
+
{
|
246 |
+
"cell_type": "code",
|
247 |
+
"execution_count": 20,
|
248 |
+
"metadata": {},
|
249 |
+
"outputs": [
|
250 |
+
{
|
251 |
+
"data": {
|
252 |
+
"text/plain": [
|
253 |
+
"1225"
|
254 |
+
]
|
255 |
+
},
|
256 |
+
"execution_count": 20,
|
257 |
+
"metadata": {},
|
258 |
+
"output_type": "execute_result"
|
259 |
+
}
|
260 |
+
],
|
261 |
+
"source": [
|
262 |
+
"len(df)"
|
263 |
+
]
|
264 |
+
},
|
265 |
+
{
|
266 |
+
"cell_type": "code",
|
267 |
+
"execution_count": 42,
|
268 |
+
"metadata": {},
|
269 |
+
"outputs": [],
|
270 |
+
"source": [
|
271 |
+
"df_len_explore = df.copy()"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"cell_type": "code",
|
276 |
+
"execution_count": 43,
|
277 |
+
"metadata": {},
|
278 |
+
"outputs": [
|
279 |
+
{
|
280 |
+
"data": {
|
281 |
+
"text/html": [
|
282 |
+
"<div>\n",
|
283 |
+
"<style scoped>\n",
|
284 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
285 |
+
" vertical-align: middle;\n",
|
286 |
+
" }\n",
|
287 |
+
"\n",
|
288 |
+
" .dataframe tbody tr th {\n",
|
289 |
+
" vertical-align: top;\n",
|
290 |
+
" }\n",
|
291 |
+
"\n",
|
292 |
+
" .dataframe thead th {\n",
|
293 |
+
" text-align: right;\n",
|
294 |
+
" }\n",
|
295 |
+
"</style>\n",
|
296 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
297 |
+
" <thead>\n",
|
298 |
+
" <tr style=\"text-align: right;\">\n",
|
299 |
+
" <th></th>\n",
|
300 |
+
" <th>Map Data</th>\n",
|
301 |
+
" </tr>\n",
|
302 |
+
" </thead>\n",
|
303 |
+
" <tbody>\n",
|
304 |
+
" <tr>\n",
|
305 |
+
" <th>0</th>\n",
|
306 |
+
" <td></td>\n",
|
307 |
+
" </tr>\n",
|
308 |
+
" <tr>\n",
|
309 |
+
" <th>1</th>\n",
|
310 |
+
" <td>Prongs Reef is a Natural;</td>\n",
|
311 |
+
" </tr>\n",
|
312 |
+
" <tr>\n",
|
313 |
+
" <th>2</th>\n",
|
314 |
+
" <td>United Services Club Golf Course is a Leisure ...</td>\n",
|
315 |
+
" </tr>\n",
|
316 |
+
" <tr>\n",
|
317 |
+
" <th>3</th>\n",
|
318 |
+
" <td>Indian Meterological Department is a Commercia...</td>\n",
|
319 |
+
" </tr>\n",
|
320 |
+
" <tr>\n",
|
321 |
+
" <th>4</th>\n",
|
322 |
+
" <td></td>\n",
|
323 |
+
" </tr>\n",
|
324 |
+
" </tbody>\n",
|
325 |
+
"</table>\n",
|
326 |
+
"</div>"
|
327 |
+
],
|
328 |
+
"text/plain": [
|
329 |
+
" Map Data\n",
|
330 |
+
"0 \n",
|
331 |
+
"1 Prongs Reef is a Natural; \n",
|
332 |
+
"2 United Services Club Golf Course is a Leisure ...\n",
|
333 |
+
"3 Indian Meterological Department is a Commercia...\n",
|
334 |
+
"4 "
|
335 |
+
]
|
336 |
+
},
|
337 |
+
"execution_count": 43,
|
338 |
+
"metadata": {},
|
339 |
+
"output_type": "execute_result"
|
340 |
+
}
|
341 |
+
],
|
342 |
+
"source": [
|
343 |
+
"## dropping the columns that are not needed for the analysis\n",
|
344 |
+
"\n",
|
345 |
+
"df_len_explore = df_len_explore.drop(columns=['row', 'col', 'latitude', 'longitude'])\n",
|
346 |
+
"df_len_explore.head()"
|
347 |
+
]
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"cell_type": "code",
|
351 |
+
"execution_count": 44,
|
352 |
+
"metadata": {},
|
353 |
+
"outputs": [
|
354 |
+
{
|
355 |
+
"data": {
|
356 |
+
"text/plain": [
|
357 |
+
"791"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
"execution_count": 44,
|
361 |
+
"metadata": {},
|
362 |
+
"output_type": "execute_result"
|
363 |
+
}
|
364 |
+
],
|
365 |
+
"source": [
|
366 |
+
"## dropping the rows with 0 string length and string length > 5000\n",
|
367 |
+
"\n",
|
368 |
+
"df_len_explore = df_len_explore[df_len_explore['Map Data'].str.len() > 0]\n",
|
369 |
+
"df_len_explore = df_len_explore[df_len_explore['Map Data'].str.len() < 5000]\n",
|
370 |
+
"len(df_len_explore)"
|
371 |
+
]
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"cell_type": "code",
|
375 |
+
"execution_count": 45,
|
376 |
+
"metadata": {},
|
377 |
+
"outputs": [
|
378 |
+
{
|
379 |
+
"name": "stdout",
|
380 |
+
"output_type": "stream",
|
381 |
+
"text": [
|
382 |
+
"Discarded rows: 434 / 1225\n"
|
383 |
+
]
|
384 |
+
}
|
385 |
+
],
|
386 |
+
"source": [
|
387 |
+
"print('Discarded rows: ', len(df) - len(df_len_explore), '/', len(df))"
|
388 |
+
]
|
389 |
+
},
|
390 |
+
{
|
391 |
+
"cell_type": "code",
|
392 |
+
"execution_count": 52,
|
393 |
+
"metadata": {},
|
394 |
+
"outputs": [
|
395 |
+
{
|
396 |
+
"name": "stdout",
|
397 |
+
"output_type": "stream",
|
398 |
+
"text": [
|
399 |
+
"Mean string length: 834.7509481668774\n",
|
400 |
+
"80th percentile string length: 1560.0\n"
|
401 |
+
]
|
402 |
+
}
|
403 |
+
],
|
404 |
+
"source": [
|
405 |
+
"## mean of the string length\n",
|
406 |
+
"\n",
|
407 |
+
"print('Mean string length: ', df_len_explore['Map Data'].str.len().mean())\n",
|
408 |
+
"print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
|
409 |
+
]
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"cell_type": "code",
|
413 |
+
"execution_count": 51,
|
414 |
+
"metadata": {},
|
415 |
+
"outputs": [
|
416 |
+
{
|
417 |
+
"data": {
|
418 |
+
"text/plain": [
|
419 |
+
"<matplotlib.lines.Line2D at 0x1c5b27a9790>"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
"execution_count": 51,
|
423 |
+
"metadata": {},
|
424 |
+
"output_type": "execute_result"
|
425 |
+
},
|
426 |
+
{
|
427 |
+
"data": {
|
428 |
+
"image/png": "",
|
429 |
+
"text/plain": [
|
430 |
+
"<Figure size 640x480 with 1 Axes>"
|
431 |
+
]
|
432 |
+
},
|
433 |
+
"metadata": {},
|
434 |
+
"output_type": "display_data"
|
435 |
+
}
|
436 |
+
],
|
437 |
+
"source": [
|
438 |
+
"## plotting lenth of strings in the Map Data Column, mean and 75th percentile\n",
|
439 |
+
"\n",
|
440 |
+
"df_len_explore['Map Data'].str.len().plot(kind='hist', bins=100)\n",
|
441 |
+
"plt.axvline(df_len_explore['Map Data'].str.len().mean(), color='red', linestyle='dashed', linewidth=2)\n",
|
442 |
+
"plt.axvline(df_len_explore['Map Data'].str.len().quantile(0.80), color='green', linestyle='dashed', linewidth=2)"
|
443 |
+
]
|
444 |
+
},
|
445 |
+
{
|
446 |
+
"cell_type": "markdown",
|
447 |
+
"metadata": {},
|
448 |
+
"source": [
|
449 |
+
"### Same visualization, post cleaning"
|
450 |
+
]
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"cell_type": "code",
|
454 |
+
"execution_count": 54,
|
455 |
+
"metadata": {},
|
456 |
+
"outputs": [
|
457 |
+
{
|
458 |
+
"name": "stderr",
|
459 |
+
"output_type": "stream",
|
460 |
+
"text": [
|
461 |
+
"[nltk_data] Downloading package stopwords to C:\\Users\\Akhil\n",
|
462 |
+
"[nltk_data] PC\\AppData\\Roaming\\nltk_data...\n",
|
463 |
+
"[nltk_data] Package stopwords is already up-to-date!\n",
|
464 |
+
"[nltk_data] Downloading package wordnet to C:\\Users\\Akhil\n",
|
465 |
+
"[nltk_data] PC\\AppData\\Roaming\\nltk_data...\n",
|
466 |
+
"[nltk_data] Package wordnet is already up-to-date!\n"
|
467 |
+
]
|
468 |
+
}
|
469 |
+
],
|
470 |
+
"source": [
|
471 |
+
"nltk.download('stopwords')\n",
|
472 |
+
"nltk.download('wordnet')\n",
|
473 |
+
"\n",
|
474 |
+
"stop_words = set(stopwords.words('english'))\n",
|
475 |
+
"stemmer = PorterStemmer()\n",
|
476 |
+
"lemmatizer = WordNetLemmatizer()"
|
477 |
+
]
|
478 |
+
},
|
479 |
+
{
|
480 |
+
"cell_type": "code",
|
481 |
+
"execution_count": 55,
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [],
|
484 |
+
"source": [
|
485 |
+
"## cleaning the strings, stemming and lemmatizing\n",
|
486 |
+
"\n",
|
487 |
+
"def clean_text(text):\n",
|
488 |
+
" text = re.sub(r'[^\\w\\s]', '', text)\n",
|
489 |
+
" text = text.lower()\n",
|
490 |
+
" text = [word for word in text.split() if word not in stop_words]\n",
|
491 |
+
" text = [stemmer.stem(word) for word in text] \n",
|
492 |
+
" text = [lemmatizer.lemmatize(word) for word in text]\n",
|
493 |
+
" return ' '.join(text)\n",
|
494 |
+
"\n",
|
495 |
+
"df_len_explore['Map Data'] = df_len_explore['Map Data'].apply(clean_text)"
|
496 |
+
]
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"cell_type": "code",
|
500 |
+
"execution_count": 56,
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [
|
503 |
+
{
|
504 |
+
"name": "stdout",
|
505 |
+
"output_type": "stream",
|
506 |
+
"text": [
|
507 |
+
"Mean string length: 596.3046776232617\n",
|
508 |
+
"80th percentile string length: 1114.0\n"
|
509 |
+
]
|
510 |
+
}
|
511 |
+
],
|
512 |
+
"source": [
|
513 |
+
"print('Mean string length: ', df_len_explore['Map Data'].str.len().mean())\n",
|
514 |
+
"print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
|
515 |
+
]
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"cell_type": "code",
|
519 |
+
"execution_count": 57,
|
520 |
+
"metadata": {},
|
521 |
+
"outputs": [
|
522 |
+
{
|
523 |
+
"data": {
|
524 |
+
"text/plain": [
|
525 |
+
"<matplotlib.lines.Line2D at 0x1c5b5e37b00>"
|
526 |
+
]
|
527 |
+
},
|
528 |
+
"execution_count": 57,
|
529 |
+
"metadata": {},
|
530 |
+
"output_type": "execute_result"
|
531 |
+
},
|
532 |
+
{
|
533 |
+
"data": {
|
534 |
+
"image/png": "",
|
535 |
+
"text/plain": [
|
536 |
+
"<Figure size 640x480 with 1 Axes>"
|
537 |
+
]
|
538 |
+
},
|
539 |
+
"metadata": {},
|
540 |
+
"output_type": "display_data"
|
541 |
+
}
|
542 |
+
],
|
543 |
+
"source": [
|
544 |
+
"df_len_explore['Map Data'].str.len().plot(kind='hist', bins=100)\n",
|
545 |
+
"plt.axvline(df_len_explore['Map Data'].str.len().mean(), color='red', linestyle='dashed', linewidth=2)\n",
|
546 |
+
"plt.axvline(df_len_explore['Map Data'].str.len().quantile(0.80), color='green', linestyle='dashed', linewidth=2)"
|
547 |
+
]
|
548 |
+
},
|
549 |
+
{
|
550 |
+
"cell_type": "code",
|
551 |
+
"execution_count": 59,
|
552 |
+
"metadata": {},
|
553 |
+
"outputs": [
|
554 |
+
{
|
555 |
+
"name": "stdout",
|
556 |
+
"output_type": "stream",
|
557 |
+
"text": [
|
558 |
+
"Original data length: 1225\n",
|
559 |
+
"Number of blanks: 407\n",
|
560 |
+
"Number of strings with length > 5000: 27\n",
|
561 |
+
"Number of useful rows: 791\n"
|
562 |
+
]
|
563 |
+
}
|
564 |
+
],
|
565 |
+
"source": [
|
566 |
+
"## Final Summary\n",
|
567 |
+
"\n",
|
568 |
+
"print('Original data length: ', len(df))\n",
|
569 |
+
"print('Number of blanks: ', len(df) - len(df[df['Map Data'].str.len() > 0]))\n",
|
570 |
+
"print('Number of strings with length > 5000: ', len(df[df['Map Data'].str.len() > 5000]))\n",
|
571 |
+
"print('Number of useful rows: ', len(df_len_explore))"
|
572 |
+
]
|
573 |
+
},
|
574 |
+
{
|
575 |
+
"cell_type": "code",
|
576 |
+
"execution_count": 62,
|
577 |
+
"metadata": {},
|
578 |
+
"outputs": [
|
579 |
+
{
|
580 |
+
"name": "stdout",
|
581 |
+
"output_type": "stream",
|
582 |
+
"text": [
|
583 |
+
"Post cleaning data length: 791\n",
|
584 |
+
"Avg string length: 596.3046776232617\n",
|
585 |
+
"Median string length: 192.0\n",
|
586 |
+
"25th percentile string length: 43.5\n",
|
587 |
+
"80th percentile string length: 1114.0\n"
|
588 |
+
]
|
589 |
+
}
|
590 |
+
],
|
591 |
+
"source": [
|
592 |
+
"print('Post cleaning data length: ', len(df_len_explore))\n",
|
593 |
+
"print('Avg string length: ', df_len_explore['Map Data'].str.len().mean())\n",
|
594 |
+
"print('Median string length: ', df_len_explore['Map Data'].str.len().median())\n",
|
595 |
+
"print('25th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.25)) \n",
|
596 |
+
"print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
|
597 |
+
]
|
598 |
+
},
|
599 |
+
{
|
600 |
+
"cell_type": "markdown",
|
601 |
+
"metadata": {},
|
602 |
+
"source": [
|
603 |
+
"Highly uneven string length distribution.\n",
|
604 |
+
"- 33% of the total data is useless, ie. blank rows\n",
|
605 |
+
"- 25% of the strings have length less than 45 characters\n",
|
606 |
+
"- 50% of the strings have length less than 200 characters\n",
|
607 |
+
"- 30% of the strings have length between 200 - 1100 characters (huge variation)\n",
|
608 |
+
"- 20% of the strings have length greater than 1100 characters\n",
|
609 |
+
"- about 5% of strings are longer than 5000 characters (wont be used in training)"
|
610 |
+
]
|
611 |
+
},
|
612 |
+
{
|
613 |
+
"cell_type": "code",
|
614 |
+
"execution_count": null,
|
615 |
+
"metadata": {},
|
616 |
+
"outputs": [],
|
617 |
+
"source": [
|
618 |
+
"## function to clean the given data frame\n",
|
619 |
+
"\n",
|
620 |
+
"def clean_text(text):\n",
|
621 |
+
" text = re.sub(r'[^\\w\\s]', '', text)\n",
|
622 |
+
" text = text.lower()\n",
|
623 |
+
" text = [word for word in text.split() if word not in stop_words]\n",
|
624 |
+
" text = [stemmer.stem(word) for word in text] \n",
|
625 |
+
" text = [lemmatizer.lemmatize(word) for word in text]\n",
|
626 |
+
" return ' '.join(text)\n",
|
627 |
+
"\n",
|
628 |
+
"def clean_data(df):\n",
|
629 |
+
" df['Map Data'] = df['Map Data'].fillna('')\n",
|
630 |
+
" df = df[df['Map Data'].str.len() > 0]\n",
|
631 |
+
" df = df[df['Map Data'].str.len() < 5000]\n",
|
632 |
+
" df['Map Data'] = df['Map Data'].apply(clean_text)\n",
|
633 |
+
" return df"
|
634 |
+
]
|
635 |
+
}
|
636 |
+
],
|
637 |
+
"metadata": {
|
638 |
+
"kernelspec": {
|
639 |
+
"display_name": "Python 3",
|
640 |
+
"language": "python",
|
641 |
+
"name": "python3"
|
642 |
+
},
|
643 |
+
"language_info": {
|
644 |
+
"codemirror_mode": {
|
645 |
+
"name": "ipython",
|
646 |
+
"version": 3
|
647 |
+
},
|
648 |
+
"file_extension": ".py",
|
649 |
+
"mimetype": "text/x-python",
|
650 |
+
"name": "python",
|
651 |
+
"nbconvert_exporter": "python",
|
652 |
+
"pygments_lexer": "ipython3",
|
653 |
+
"version": "3.12.0"
|
654 |
+
}
|
655 |
+
},
|
656 |
+
"nbformat": 4,
|
657 |
+
"nbformat_minor": 2
|
658 |
+
}
|
notebooks/TFIDF.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/__init__.py
DELETED
File without changes
|
notebooks/data_loading.ipynb
ADDED
@@ -0,0 +1,996 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## __Data Pipelines__ \n",
|
8 |
+
"Loading data from OpenStreetMap using overpass API"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": 60,
|
14 |
+
"metadata": {},
|
15 |
+
"outputs": [],
|
16 |
+
"source": [
|
17 |
+
"import requests\n",
|
18 |
+
"import pandas as pd\n",
|
19 |
+
"import re\n",
|
20 |
+
"import math\n",
|
21 |
+
"from typing import Tuple, List, Dict"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 97,
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [],
|
29 |
+
"source": [
|
30 |
+
"def fetch_osm_data(lat: float, lon: float, radius: int) -> List[Dict]:\n",
|
31 |
+
" overpass_url = \"http://overpass-api.de/api/interpreter\"\n",
|
32 |
+
" overpass_query = f\"\"\"\n",
|
33 |
+
" [out:json];\n",
|
34 |
+
" (\n",
|
35 |
+
" node[\"name\"](around:{radius},{lat},{lon});\n",
|
36 |
+
" way[\"name\"](around:{radius},{lat},{lon});\n",
|
37 |
+
" relation[\"name\"](around:{radius},{lat},{lon});\n",
|
38 |
+
" );\n",
|
39 |
+
" out center;\n",
|
40 |
+
" \"\"\"\n",
|
41 |
+
" \n",
|
42 |
+
" response = requests.get(overpass_url, params={'data': overpass_query})\n",
|
43 |
+
" data = response.json()\n",
|
44 |
+
" return data['elements']\n",
|
45 |
+
"\n",
|
46 |
+
"def determine_location_type(tags: Dict[str, str]) -> str:\n",
|
47 |
+
" # Residential\n",
|
48 |
+
" if 'building' in tags and tags['building'] in ['residential', 'house', 'apartments', 'detached', 'terrace', 'dormitory', 'bungalow']:\n",
|
49 |
+
" return 'Residential'\n",
|
50 |
+
" \n",
|
51 |
+
" # Commercial\n",
|
52 |
+
" if any(key in tags for key in ['shop', 'office', 'craft']):\n",
|
53 |
+
" return 'Commercial'\n",
|
54 |
+
" if 'building' in tags and tags['building'] in ['commercial', 'office', 'retail', 'supermarket', 'kiosk']:\n",
|
55 |
+
" return 'Commercial'\n",
|
56 |
+
" \n",
|
57 |
+
" # Industrial\n",
|
58 |
+
" if 'building' in tags and tags['building'] in ['industrial', 'warehouse', 'factory', 'manufacture']:\n",
|
59 |
+
" return 'Industrial'\n",
|
60 |
+
" if 'industrial' in tags or 'industry' in tags:\n",
|
61 |
+
" return 'Industrial'\n",
|
62 |
+
" \n",
|
63 |
+
" # Educational\n",
|
64 |
+
" if 'amenity' in tags and tags['amenity'] in ['school', 'university', 'college', 'library', 'kindergarten', 'language_school']:\n",
|
65 |
+
" return 'Educational'\n",
|
66 |
+
" \n",
|
67 |
+
" # Healthcare\n",
|
68 |
+
" if 'amenity' in tags and tags['amenity'] in ['hospital', 'clinic', 'doctors', 'dentist', 'pharmacy', 'veterinary']:\n",
|
69 |
+
" return 'Healthcare'\n",
|
70 |
+
" \n",
|
71 |
+
" # Food & Drink\n",
|
72 |
+
" if 'amenity' in tags and tags['amenity'] in ['restaurant', 'cafe', 'bar', 'fast_food', 'pub', 'food_court']:\n",
|
73 |
+
" return 'Food & Drink'\n",
|
74 |
+
" \n",
|
75 |
+
" # Leisure & Entertainment\n",
|
76 |
+
" if 'leisure' in tags or 'tourism' in tags:\n",
|
77 |
+
" return 'Leisure & Entertainment'\n",
|
78 |
+
" if 'amenity' in tags and tags['amenity'] in ['theatre', 'cinema', 'nightclub', 'arts_centre', 'community_centre']:\n",
|
79 |
+
" return 'Leisure & Entertainment'\n",
|
80 |
+
" \n",
|
81 |
+
" # Transportation\n",
|
82 |
+
" if 'amenity' in tags and tags['amenity'] in ['parking', 'bicycle_parking', 'bus_station', 'ferry_terminal']:\n",
|
83 |
+
" return 'Transportation'\n",
|
84 |
+
" if 'highway' in tags or 'railway' in tags or 'aeroway' in tags:\n",
|
85 |
+
" return 'Transportation'\n",
|
86 |
+
" \n",
|
87 |
+
" # Religious\n",
|
88 |
+
" if 'amenity' in tags and tags['amenity'] in ['place_of_worship', 'monastery']:\n",
|
89 |
+
" return 'Religious'\n",
|
90 |
+
" \n",
|
91 |
+
" # Government & Public Services\n",
|
92 |
+
" if 'amenity' in tags and tags['amenity'] in ['townhall', 'courthouse', 'police', 'fire_station', 'post_office']:\n",
|
93 |
+
" return 'Government & Public Services'\n",
|
94 |
+
" \n",
|
95 |
+
" # Parks & Recreation\n",
|
96 |
+
" if 'leisure' in tags and tags['leisure'] in ['park', 'playground', 'sports_centre', 'stadium', 'garden']:\n",
|
97 |
+
" return 'Parks & Recreation'\n",
|
98 |
+
" \n",
|
99 |
+
" # Natural\n",
|
100 |
+
" if 'natural' in tags:\n",
|
101 |
+
" return 'Natural'\n",
|
102 |
+
" \n",
|
103 |
+
" # Landuse\n",
|
104 |
+
" if 'landuse' in tags:\n",
|
105 |
+
" landuse = tags['landuse'].capitalize()\n",
|
106 |
+
" if landuse in ['Residential', 'Commercial', 'Industrial', 'Retail']:\n",
|
107 |
+
" return landuse\n",
|
108 |
+
" else:\n",
|
109 |
+
" return f'Landuse: {landuse}'\n",
|
110 |
+
" \n",
|
111 |
+
" # If no specific category is found, return 'Other'\n",
|
112 |
+
" return 'Other'\n",
|
113 |
+
"\n",
|
114 |
def parse_osm_data(elements: List[Dict]) -> pd.DataFrame:
    """Flatten raw OSM elements into an ID / name / type table.

    Every element contributes one row:

    * ``ID`` is ``"<type>_<id>"`` built from the element's own fields,
    * ``Location Name`` is its ``name`` tag (empty string when absent),
    * ``Location Type`` is what ``determine_location_type`` returns for
      the element's tags (an element without tags is classified from ``{}``).

    An empty input yields an empty frame that still has the three columns.
    """
    column_names = ['ID', 'Location Name', 'Location Type']

    def _to_record(osm_element: Dict) -> Dict:
        # One output row for a single OSM element.
        osm_tags = osm_element.get('tags', {})
        return {
            'ID': f"{osm_element['type']}_{osm_element['id']}",
            'Location Name': osm_tags.get('name', ''),
            'Location Type': determine_location_type(osm_tags),
        }

    records = [_to_record(osm_element) for osm_element in elements]
    if not records:
        # Without rows pandas cannot infer the columns, so name them explicitly.
        return pd.DataFrame(columns=column_names)
    return pd.DataFrame(records)
|
127 |
+
"\n",
|
128 |
def get_osm_data(lat: float, lon: float, radius: int) -> pd.DataFrame:
    """Fetch the OSM elements around a point and return them as a table.

    The coordinates and radius are handed unchanged to ``fetch_osm_data``;
    whatever it returns is converted to a DataFrame by ``parse_osm_data``.
    """
    return parse_osm_data(fetch_osm_data(lat, lon, radius))
|
131 |
+
"\n",
|
132 |
def dms_to_decimal(coord_str):
    """Convert a degrees/minutes/seconds coordinate pair to decimal degrees.

    ``coord_str`` must start with a latitude followed by a longitude, for
    example ``19°03'08.6"N 72°54'06.0"E``.  Southern latitudes and western
    longitudes are returned as negative numbers.

    Returns:
        A ``(lat, lon)`` tuple of floats.

    Raises:
        ValueError: if ``coord_str`` does not start with the expected format.
    """
    # Two D°M'S" groups, each followed by its hemisphere letter; latitude first.
    pattern = r'(\d+)°(\d+)\'([\d.]+)"([NS])\s*(\d+)°(\d+)\'([\d.]+)"([EW])'

    parsed = re.match(pattern, coord_str)
    if parsed is None:
        raise ValueError("Invalid coordinate format. Expected format: 19°03'08.6\"N 72°54'06.0\"E")

    def _as_degrees(degrees, minutes, seconds):
        # 60 minutes per degree, 3600 seconds per degree.
        return float(degrees) + float(minutes)/60 + float(seconds)/3600

    fields = parsed.groups()
    lat = _as_degrees(fields[0], fields[1], fields[2])
    lon = _as_degrees(fields[4], fields[5], fields[6])

    # The hemisphere letters (groups 4 and 8) decide the sign.
    if fields[3] == 'S':
        lat = -lat
    if fields[7] == 'W':
        lon = -lon

    return lat, lon
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "code",
|
157 |
+
"execution_count": 91,
|
158 |
+
"metadata": {},
|
159 |
+
"outputs": [
|
160 |
+
{
|
161 |
+
"name": "stdout",
|
162 |
+
"output_type": "stream",
|
163 |
+
"text": [
|
164 |
+
"Latitude: 19.015805555555556\n",
|
165 |
+
"Longitude: 72.89944444444446\n"
|
166 |
+
]
|
167 |
+
}
|
168 |
+
],
|
169 |
+
"source": [
|
170 |
+
"coord_str = '19°00\\'56.9\"N 72°53\\'58.0\"E'\n",
|
171 |
+
"radius_meters = 1000\n",
|
172 |
+
"try:\n",
|
173 |
+
" latitude, longitude = dms_to_decimal(coord_str)\n",
|
174 |
+
" print(f\"Latitude: {latitude}\")\n",
|
175 |
+
" print(f\"Longitude: {longitude}\")\n",
|
176 |
+
"except ValueError as e:\n",
|
177 |
+
" print(f\"Error: {e}\")"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": 92,
|
183 |
+
"metadata": {},
|
184 |
+
"outputs": [],
|
185 |
+
"source": [
|
186 |
+
"result_df = get_osm_data(latitude, longitude, radius_meters)"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": 93,
|
192 |
+
"metadata": {},
|
193 |
+
"outputs": [
|
194 |
+
{
|
195 |
+
"data": {
|
196 |
+
"text/html": [
|
197 |
+
"<div>\n",
|
198 |
+
"<style scoped>\n",
|
199 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
200 |
+
" vertical-align: middle;\n",
|
201 |
+
" }\n",
|
202 |
+
"\n",
|
203 |
+
" .dataframe tbody tr th {\n",
|
204 |
+
" vertical-align: top;\n",
|
205 |
+
" }\n",
|
206 |
+
"\n",
|
207 |
+
" .dataframe thead th {\n",
|
208 |
+
" text-align: right;\n",
|
209 |
+
" }\n",
|
210 |
+
"</style>\n",
|
211 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
212 |
+
" <thead>\n",
|
213 |
+
" <tr style=\"text-align: right;\">\n",
|
214 |
+
" <th></th>\n",
|
215 |
+
" <th>ID</th>\n",
|
216 |
+
" <th>Location Name</th>\n",
|
217 |
+
" <th>Location Type</th>\n",
|
218 |
+
" </tr>\n",
|
219 |
+
" </thead>\n",
|
220 |
+
" <tbody>\n",
|
221 |
+
" <tr>\n",
|
222 |
+
" <th>0</th>\n",
|
223 |
+
" <td>node_622002639</td>\n",
|
224 |
+
" <td>Mahul</td>\n",
|
225 |
+
" <td>Other</td>\n",
|
226 |
+
" </tr>\n",
|
227 |
+
" <tr>\n",
|
228 |
+
" <th>1</th>\n",
|
229 |
+
" <td>node_622005407</td>\n",
|
230 |
+
" <td>Gowanpada</td>\n",
|
231 |
+
" <td>Other</td>\n",
|
232 |
+
" </tr>\n",
|
233 |
+
" <tr>\n",
|
234 |
+
" <th>2</th>\n",
|
235 |
+
" <td>node_1646222635</td>\n",
|
236 |
+
" <td>gadakary bus stop</td>\n",
|
237 |
+
" <td>Transportation</td>\n",
|
238 |
+
" </tr>\n",
|
239 |
+
" <tr>\n",
|
240 |
+
" <th>3</th>\n",
|
241 |
+
" <td>node_1646222681</td>\n",
|
242 |
+
" <td>vishnu nagar bus stop</td>\n",
|
243 |
+
" <td>Other</td>\n",
|
244 |
+
" </tr>\n",
|
245 |
+
" <tr>\n",
|
246 |
+
" <th>4</th>\n",
|
247 |
+
" <td>node_2932495033</td>\n",
|
248 |
+
" <td>Sree Dutta mandir</td>\n",
|
249 |
+
" <td>Religious</td>\n",
|
250 |
+
" </tr>\n",
|
251 |
+
" <tr>\n",
|
252 |
+
" <th>5</th>\n",
|
253 |
+
" <td>node_11954176622</td>\n",
|
254 |
+
" <td>Gavhanpada</td>\n",
|
255 |
+
" <td>Other</td>\n",
|
256 |
+
" </tr>\n",
|
257 |
+
" <tr>\n",
|
258 |
+
" <th>6</th>\n",
|
259 |
+
" <td>way_25587616</td>\n",
|
260 |
+
" <td>Bhikaji Damaji Patil Marg</td>\n",
|
261 |
+
" <td>Transportation</td>\n",
|
262 |
+
" </tr>\n",
|
263 |
+
" <tr>\n",
|
264 |
+
" <th>7</th>\n",
|
265 |
+
" <td>way_122289587</td>\n",
|
266 |
+
" <td>Mulund - Trombay 220 KV line</td>\n",
|
267 |
+
" <td>Other</td>\n",
|
268 |
+
" </tr>\n",
|
269 |
+
" <tr>\n",
|
270 |
+
" <th>8</th>\n",
|
271 |
+
" <td>way_151783563</td>\n",
|
272 |
+
" <td>Laxman Umaji Gadkari Marg</td>\n",
|
273 |
+
" <td>Transportation</td>\n",
|
274 |
+
" </tr>\n",
|
275 |
+
" <tr>\n",
|
276 |
+
" <th>9</th>\n",
|
277 |
+
" <td>way_151783570</td>\n",
|
278 |
+
" <td>Vishnu Nagar Road</td>\n",
|
279 |
+
" <td>Transportation</td>\n",
|
280 |
+
" </tr>\n",
|
281 |
+
" </tbody>\n",
|
282 |
+
"</table>\n",
|
283 |
+
"</div>"
|
284 |
+
],
|
285 |
+
"text/plain": [
|
286 |
+
" ID Location Name Location Type\n",
|
287 |
+
"0 node_622002639 Mahul Other\n",
|
288 |
+
"1 node_622005407 Gowanpada Other\n",
|
289 |
+
"2 node_1646222635 gadakary bus stop Transportation\n",
|
290 |
+
"3 node_1646222681 vishnu nagar bus stop Other\n",
|
291 |
+
"4 node_2932495033 Sree Dutta mandir Religious\n",
|
292 |
+
"5 node_11954176622 Gavhanpada Other\n",
|
293 |
+
"6 way_25587616 Bhikaji Damaji Patil Marg Transportation\n",
|
294 |
+
"7 way_122289587 Mulund - Trombay 220 KV line Other\n",
|
295 |
+
"8 way_151783563 Laxman Umaji Gadkari Marg Transportation\n",
|
296 |
+
"9 way_151783570 Vishnu Nagar Road Transportation"
|
297 |
+
]
|
298 |
+
},
|
299 |
+
"execution_count": 93,
|
300 |
+
"metadata": {},
|
301 |
+
"output_type": "execute_result"
|
302 |
+
}
|
303 |
+
],
|
304 |
+
"source": [
|
305 |
+
"result_df.head(10)"
|
306 |
+
]
|
307 |
+
},
|
308 |
+
{
|
309 |
+
"cell_type": "code",
|
310 |
+
"execution_count": 94,
|
311 |
+
"metadata": {},
|
312 |
+
"outputs": [
|
313 |
+
{
|
314 |
+
"data": {
|
315 |
+
"text/html": [
|
316 |
+
"<div>\n",
|
317 |
+
"<style scoped>\n",
|
318 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
319 |
+
" vertical-align: middle;\n",
|
320 |
+
" }\n",
|
321 |
+
"\n",
|
322 |
+
" .dataframe tbody tr th {\n",
|
323 |
+
" vertical-align: top;\n",
|
324 |
+
" }\n",
|
325 |
+
"\n",
|
326 |
+
" .dataframe thead th {\n",
|
327 |
+
" text-align: right;\n",
|
328 |
+
" }\n",
|
329 |
+
"</style>\n",
|
330 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
331 |
+
" <thead>\n",
|
332 |
+
" <tr style=\"text-align: right;\">\n",
|
333 |
+
" <th></th>\n",
|
334 |
+
" <th>ID</th>\n",
|
335 |
+
" <th>Location Name</th>\n",
|
336 |
+
" <th>Location Type</th>\n",
|
337 |
+
" </tr>\n",
|
338 |
+
" </thead>\n",
|
339 |
+
" <tbody>\n",
|
340 |
+
" <tr>\n",
|
341 |
+
" <th>11</th>\n",
|
342 |
+
" <td>way_430012316</td>\n",
|
343 |
+
" <td>track</td>\n",
|
344 |
+
" <td>Residential</td>\n",
|
345 |
+
" </tr>\n",
|
346 |
+
" <tr>\n",
|
347 |
+
" <th>12</th>\n",
|
348 |
+
" <td>way_430012318</td>\n",
|
349 |
+
" <td>Mumbai Refinery Mahul</td>\n",
|
350 |
+
" <td>Industrial</td>\n",
|
351 |
+
" </tr>\n",
|
352 |
+
" <tr>\n",
|
353 |
+
" <th>13</th>\n",
|
354 |
+
" <td>way_430012320</td>\n",
|
355 |
+
" <td>Mumbai Refinery</td>\n",
|
356 |
+
" <td>Industrial</td>\n",
|
357 |
+
" </tr>\n",
|
358 |
+
" </tbody>\n",
|
359 |
+
"</table>\n",
|
360 |
+
"</div>"
|
361 |
+
],
|
362 |
+
"text/plain": [
|
363 |
+
" ID Location Name Location Type\n",
|
364 |
+
"11 way_430012316 track Residential\n",
|
365 |
+
"12 way_430012318 Mumbai Refinery Mahul Industrial\n",
|
366 |
+
"13 way_430012320 Mumbai Refinery Industrial"
|
367 |
+
]
|
368 |
+
},
|
369 |
+
"execution_count": 94,
|
370 |
+
"metadata": {},
|
371 |
+
"output_type": "execute_result"
|
372 |
+
}
|
373 |
+
],
|
374 |
+
"source": [
|
375 |
+
"labelled_df = result_df[result_df['Location Type'] != 'Other']\n",
|
376 |
+
"labelled_df = labelled_df[labelled_df['Location Type'] != 'Religious']\n",
|
377 |
+
"labelled_df = labelled_df[labelled_df['Location Type'] != 'Transportation']\n",
|
378 |
+
"labelled_df.head(10)"
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"cell_type": "code",
|
383 |
+
"execution_count": 95,
|
384 |
+
"metadata": {},
|
385 |
+
"outputs": [
|
386 |
+
{
|
387 |
+
"data": {
|
388 |
+
"text/html": [
|
389 |
+
"<div>\n",
|
390 |
+
"<style scoped>\n",
|
391 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
392 |
+
" vertical-align: middle;\n",
|
393 |
+
" }\n",
|
394 |
+
"\n",
|
395 |
+
" .dataframe tbody tr th {\n",
|
396 |
+
" vertical-align: top;\n",
|
397 |
+
" }\n",
|
398 |
+
"\n",
|
399 |
+
" .dataframe thead th {\n",
|
400 |
+
" text-align: right;\n",
|
401 |
+
" }\n",
|
402 |
+
"</style>\n",
|
403 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
404 |
+
" <thead>\n",
|
405 |
+
" <tr style=\"text-align: right;\">\n",
|
406 |
+
" <th></th>\n",
|
407 |
+
" <th>Location Name</th>\n",
|
408 |
+
" <th>Location Type</th>\n",
|
409 |
+
" </tr>\n",
|
410 |
+
" </thead>\n",
|
411 |
+
" <tbody>\n",
|
412 |
+
" <tr>\n",
|
413 |
+
" <th>0</th>\n",
|
414 |
+
" <td>track</td>\n",
|
415 |
+
" <td>Residential</td>\n",
|
416 |
+
" </tr>\n",
|
417 |
+
" <tr>\n",
|
418 |
+
" <th>1</th>\n",
|
419 |
+
" <td>Mumbai Refinery Mahul</td>\n",
|
420 |
+
" <td>Industrial</td>\n",
|
421 |
+
" </tr>\n",
|
422 |
+
" <tr>\n",
|
423 |
+
" <th>2</th>\n",
|
424 |
+
" <td>Mumbai Refinery</td>\n",
|
425 |
+
" <td>Industrial</td>\n",
|
426 |
+
" </tr>\n",
|
427 |
+
" </tbody>\n",
|
428 |
+
"</table>\n",
|
429 |
+
"</div>"
|
430 |
+
],
|
431 |
+
"text/plain": [
|
432 |
+
" Location Name Location Type\n",
|
433 |
+
"0 track Residential\n",
|
434 |
+
"1 Mumbai Refinery Mahul Industrial\n",
|
435 |
+
"2 Mumbai Refinery Industrial"
|
436 |
+
]
|
437 |
+
},
|
438 |
+
"execution_count": 95,
|
439 |
+
"metadata": {},
|
440 |
+
"output_type": "execute_result"
|
441 |
+
}
|
442 |
+
],
|
443 |
+
"source": [
|
444 |
+
"## removing duplicates\n",
|
445 |
+
"\n",
|
446 |
+
"loc_types = []\n",
|
447 |
+
"for row in labelled_df.iterrows():\n",
|
448 |
+
" loc_type = (row[1]['Location Name'], row[1]['Location Type'])\n",
|
449 |
+
" if loc_type not in loc_types:\n",
|
450 |
+
" loc_types.append(loc_type)\n",
|
451 |
+
"\n",
|
452 |
+
"labelled_df = pd.DataFrame(loc_types, columns=['Location Name', 'Location Type'])\n",
|
453 |
+
"labelled_df.head(20)"
|
454 |
+
]
|
455 |
+
},
|
456 |
+
{
|
457 |
+
"cell_type": "code",
|
458 |
+
"execution_count": 58,
|
459 |
+
"metadata": {},
|
460 |
+
"outputs": [],
|
461 |
+
"source": [
|
462 |
+
"row_of_dataset = ''\n",
|
463 |
+
"\n",
|
464 |
+
"for row in labelled_df.iterrows():\n",
|
465 |
+
" row_text = row[1]['Location Name'] + ' is a ' + row[1]['Location Type']\n",
|
466 |
+
" row_of_dataset += row_text + ', '"
|
467 |
+
]
|
468 |
+
},
|
469 |
+
{
|
470 |
+
"cell_type": "code",
|
471 |
+
"execution_count": 59,
|
472 |
+
"metadata": {},
|
473 |
+
"outputs": [
|
474 |
+
{
|
475 |
+
"data": {
|
476 |
+
"text/plain": [
|
477 |
+
"'Oswal Company Trees is a Natural, Newspaper stall is a Commercial, Shiv Polyclinic and Nursing Home is a Healthcare, राजपूत मेडिकल is a Healthcare, Bhabha Atomic Research Centre - BARC is a Industrial, BPCL Sports Club is a Leisure & Entertainment, New Bharat Nagar, Banjara tanda, Hasina Nagar is a Residential, Old Bharat Nagar is a Residential, Rashtriya Chemicals & Fertilizers is a Industrial, Koyna Colony is a Residential, D is a Residential, A-2 is a Residential, flip card is a Commercial, track is a Residential, Mumbai Refinery Mahul is a Industrial, Mumbai Refinery is a Industrial, Trombay Thermal Power Station is a Industrial, Vitta Sanchay Society is a Residential, E is a Residential, Acharya Sharad Narayan Udyan is a Leisure & Entertainment, bmc park is a Leisure & Entertainment, Mysore Colony Central Garden is a Leisure & Entertainment, BMC owned trees is a Natural, BMC PARK is a Leisure & Entertainment, Mysore colony eastern park is a Leisure & Entertainment, Trees owned by RCF is a Natural, Mysore Colony trees is a Natural, NAVAL KG School, TS MAHUL is a Educational, '"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
"execution_count": 59,
|
481 |
+
"metadata": {},
|
482 |
+
"output_type": "execute_result"
|
483 |
+
}
|
484 |
+
],
|
485 |
+
"source": [
|
486 |
+
"row_of_dataset"
|
487 |
+
]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"cell_type": "markdown",
|
491 |
+
"metadata": {},
|
492 |
+
"source": [
|
493 |
+
"This is one row of the dataset. Next, we write a function to extract all such rows from a given large map area."
|
494 |
+
]
|
495 |
+
},
|
496 |
+
{
|
497 |
+
"cell_type": "code",
|
498 |
+
"execution_count": 61,
|
499 |
+
"metadata": {},
|
500 |
+
"outputs": [],
|
501 |
+
"source": [
|
502 |
+
"## input point is at the bottom left of the map\n",
|
503 |
+
"\n",
|
504 |
def calculate_distant_points(lat: float, lon: float, distance: float) -> tuple:
    """Return the points ``distance`` meters east and north of ``(lat, lon)``.

    The Earth is treated as a sphere of radius 6,371,000 m.  The eastward
    offset is scaled by ``cos(latitude)``; the northward offset is not.
    Results are not wrapped or clamped, so a large ``distance`` can push the
    longitude beyond ±180° or the latitude beyond ±90°.

    Args:
        lat: Latitude of the starting point, in decimal degrees.
        lon: Longitude of the starting point, in decimal degrees.
        distance: Offset in meters (a negative value moves west / south).

    Returns:
        ``((lat, lon_east), (lat_north, lon))`` — the point at the same
        latitude moved east, then the point at the same longitude moved north.
    """
    # Earth's radius in meters
    R = 6371000

    # Only the latitude is needed in radians (for the cosine below).
    lat_rad = math.radians(lat)

    # Same latitude, moving east-west: one radian of longitude spans
    # R * cos(latitude) meters on this parallel.
    delta_lon = distance / (R * math.cos(lat_rad))
    lon1 = lon + math.degrees(delta_lon)

    # Same longitude, moving north-south: one radian of latitude spans R meters.
    delta_lat = distance / R
    lat2 = lat + math.degrees(delta_lat)

    return ((lat, lon1), (lat2, lon))
|
521 |
+
]
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"cell_type": "code",
|
525 |
+
"execution_count": 66,
|
526 |
+
"metadata": {},
|
527 |
+
"outputs": [
|
528 |
+
{
|
529 |
+
"name": "stdout",
|
530 |
+
"output_type": "stream",
|
531 |
+
"text": [
|
532 |
+
"Original point: (40.7128, -74.006)\n",
|
533 |
+
"Point 1000m east: (40.712800, -73.709386)\n",
|
534 |
+
"Point 1000m north: (40.937630, -74.006000)\n"
|
535 |
+
]
|
536 |
+
}
|
537 |
+
],
|
538 |
+
"source": [
|
539 |
+
"if __name__ == \"__main__\":\n",
|
540 |
+
" latitude = 40.7128 # New York City latitude\n",
|
541 |
+
" longitude = -74.0060 # New York City longitude\n",
|
542 |
+
" distance = 1000*25 # 25,000 meters (25 km)\n",
|
543 |
+
"\n",
|
544 |
+
" result = calculate_distant_points(latitude, longitude, distance)\n",
|
545 |
+
" print(f\"Original point: ({latitude}, {longitude})\")\n",
|
546 |
+
" print(f\"Point 1000m east: ({result[0][0]:.6f}, {result[0][1]:.6f})\")\n",
|
547 |
+
" print(f\"Point 1000m north: ({result[1][0]:.6f}, {result[1][1]:.6f})\")"
|
548 |
+
]
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"cell_type": "code",
|
552 |
+
"execution_count": 69,
|
553 |
+
"metadata": {},
|
554 |
+
"outputs": [
|
555 |
+
{
|
556 |
+
"name": "stdout",
|
557 |
+
"output_type": "stream",
|
558 |
+
"text": [
|
559 |
+
"Bottom Left: (40.7128, -74.006)\n",
|
560 |
+
"Top Left: (40.93763040147969, -74.006)\n",
|
561 |
+
"Bottom Right: (40.7128, -73.7093855252233)\n",
|
562 |
+
"Top Right: (40.93763040147969, -73.7093855252233)\n"
|
563 |
+
]
|
564 |
+
}
|
565 |
+
],
|
566 |
+
"source": [
|
567 |
+
"bottom_left_latitude = 40.7128\n",
|
568 |
+
"bottom_left_longitude = -74.0060\n",
|
569 |
+
"\n",
|
570 |
+
"result = calculate_distant_points(bottom_left_latitude, bottom_left_longitude, 1000*25)\n",
|
571 |
+
"\n",
|
572 |
+
"top_left_latitude = result[1][0]\n",
|
573 |
+
"top_left_longitude = result[1][1]\n",
|
574 |
+
"\n",
|
575 |
+
"bottom_right_latitude = result[0][0]\n",
|
576 |
+
"bottom_right_longitude = result[0][1]\n",
|
577 |
+
"\n",
|
578 |
+
"top_right_latitude = top_left_latitude\n",
|
579 |
+
"top_right_longitude = bottom_right_longitude\n",
|
580 |
+
"\n",
|
581 |
+
"print(f\"Bottom Left: ({bottom_left_latitude}, {bottom_left_longitude})\")\n",
|
582 |
+
"print(f\"Top Left: ({top_left_latitude}, {top_left_longitude})\")\n",
|
583 |
+
"print(f\"Bottom Right: ({bottom_right_latitude}, {bottom_right_longitude})\")\n",
|
584 |
+
"print(f\"Top Right: ({top_right_latitude}, {top_right_longitude})\")"
|
585 |
+
]
|
586 |
+
},
|
587 |
+
{
|
588 |
+
"cell_type": "code",
|
589 |
+
"execution_count": 71,
|
590 |
+
"metadata": {},
|
591 |
+
"outputs": [
|
592 |
+
{
|
593 |
+
"data": {
|
594 |
+
"text/plain": [
|
595 |
+
"(0.008993216059187433, 0.01186457899106813)"
|
596 |
+
]
|
597 |
+
},
|
598 |
+
"execution_count": 71,
|
599 |
+
"metadata": {},
|
600 |
+
"output_type": "execute_result"
|
601 |
+
}
|
602 |
+
],
|
603 |
+
"source": [
|
604 |
+
"latitude_shift = top_left_latitude - bottom_left_latitude\n",
|
605 |
+
"longitude_shift = bottom_right_longitude - bottom_left_longitude\n",
|
606 |
+
"\n",
|
607 |
+
"latitude_unit = latitude_shift / 25\n",
|
608 |
+
"longitude_unit = longitude_shift / 25\n",
|
609 |
+
"\n",
|
610 |
+
"latitude_unit, longitude_unit"
|
611 |
+
]
|
612 |
+
},
|
613 |
+
{
|
614 |
+
"cell_type": "code",
|
615 |
+
"execution_count": 73,
|
616 |
+
"metadata": {},
|
617 |
+
"outputs": [],
|
618 |
+
"source": [
|
619 |
+
"## 2d map grid (0,0) --> bottom left\n",
|
620 |
+
"\n",
|
621 |
def create_map_grid(bottom_left: Tuple[float, float], top_right: Tuple[float, float], rows: int, cols: int) -> List[List[Tuple[float, float]]]:
    """Split a lat/lon rectangle into ``rows`` x ``cols`` cells and return their centers.

    Args:
        bottom_left: ``(lat, lon)`` of the rectangle's bottom-left corner.
        top_right: ``(lat, lon)`` of the rectangle's top-right corner.
        rows: Number of cells along the latitude axis.
        cols: Number of cells along the longitude axis.

    Returns:
        A list of ``rows`` lists, each holding ``cols`` ``(lat, lon)`` center
        points.  ``grid[0][0]`` is the cell touching ``bottom_left``; the row
        index grows with latitude and the column index with longitude.

    Raises:
        ZeroDivisionError: if ``rows`` or ``cols`` is 0.
    """
    origin_lat, origin_lon = bottom_left[0], bottom_left[1]

    # Size of one cell along each axis, in degrees.
    lat_step = (top_right[0] - origin_lat) / rows
    lon_step = (top_right[1] - origin_lon) / cols

    # Cell (r, c) starts at origin + index * step; adding half a step moves
    # from the cell's corner to its center.
    return [
        [
            (
                origin_lat + r * lat_step + lat_step / 2,
                origin_lon + c * lon_step + lon_step / 2,
            )
            for c in range(cols)
        ]
        for r in range(rows)
    ]
|
637 |
+
]
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"cell_type": "code",
|
641 |
+
"execution_count": 79,
|
642 |
+
"metadata": {},
|
643 |
+
"outputs": [],
|
644 |
+
"source": [
|
645 |
+
"grid = create_map_grid((bottom_left_latitude, bottom_left_longitude), (top_right_latitude, top_right_longitude), 25, 25)"
|
646 |
+
]
|
647 |
+
},
|
648 |
+
{
|
649 |
+
"cell_type": "code",
|
650 |
+
"execution_count": 108,
|
651 |
+
"metadata": {},
|
652 |
+
"outputs": [],
|
653 |
+
"source": [
|
654 |
+
"grid_dataset = []\n",
|
655 |
+
"for i, row in enumerate(grid):\n",
|
656 |
+
" for j, point in enumerate(row):\n",
|
657 |
+
" \n",
|
658 |
+
" grid_row = {\"row\": i, \"col\": j, \"latitude\": point[0], \"longitude\": point[1]}\n",
|
659 |
+
" grid_dataset.append(grid_row)\n",
|
660 |
+
"\n",
|
661 |
+
"grid_df = pd.DataFrame(grid_dataset)"
|
662 |
+
]
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"cell_type": "code",
|
666 |
+
"execution_count": 83,
|
667 |
+
"metadata": {},
|
668 |
+
"outputs": [],
|
669 |
+
"source": [
|
670 |
+
"left_lat = 18.889833\n",
|
671 |
+
"left_lon = 72.779844"
|
672 |
+
]
|
673 |
+
},
|
674 |
+
{
|
675 |
+
"cell_type": "code",
|
676 |
+
"execution_count": 84,
|
677 |
+
"metadata": {},
|
678 |
+
"outputs": [],
|
679 |
+
"source": [
|
680 |
+
"res1 = calculate_distant_points(left_lat, left_lon, 1000*35)\n",
|
681 |
+
"\n",
|
682 |
+
"right_lat = res1[1][0]\n",
|
683 |
+
"right_lon = res1[0][1]"
|
684 |
+
]
|
685 |
+
},
|
686 |
+
{
|
687 |
+
"cell_type": "code",
|
688 |
+
"execution_count": 85,
|
689 |
+
"metadata": {},
|
690 |
+
"outputs": [],
|
691 |
+
"source": [
|
692 |
+
"grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), 35, 35)"
|
693 |
+
]
|
694 |
+
},
|
695 |
+
{
|
696 |
+
"cell_type": "code",
|
697 |
+
"execution_count": null,
|
698 |
+
"metadata": {},
|
699 |
+
"outputs": [],
|
700 |
+
"source": [
|
701 |
+
"grid_dataset = []\n",
|
702 |
+
"for i, row in enumerate(grid):\n",
|
703 |
+
" for j, point in enumerate(row):\n",
|
704 |
+
" grid_row = {\"row\": i, \"col\": j, \"latitude\": point[0], \"longitude\": point[1]}\n",
|
705 |
+
" grid_dataset.append(grid_row)\n",
|
706 |
+
"\n",
|
707 |
+
"grid_df = pd.DataFrame(grid_dataset)\n",
|
708 |
+
"grid_df.head(25)"
|
709 |
+
]
|
710 |
+
},
|
711 |
+
{
|
712 |
+
"cell_type": "code",
|
713 |
+
"execution_count": 106,
|
714 |
+
"metadata": {},
|
715 |
+
"outputs": [],
|
716 |
+
"source": [
|
717 |
## entire pipeline
##
## Builds a dist x dist grid over the area whose bottom-left corner is
## (left_lat, left_lon), queries OSM around every cell center, and writes one
## CSV row per cell containing a text summary of the labelled places found.

left_lat = 18.889833
left_lon = 72.779844
dist = 35  # side of the area in km; also the number of grid cells per side

res1 = calculate_distant_points(left_lat, left_lon, 1000*dist)

# res1 is ((lat, lon moved east), (lat moved north, lon)); combine the two
# into the top-right corner of the area.
right_lat = res1[1][0]
right_lon = res1[0][1]
grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), dist, dist)

# Categories that are dropped before a cell's description is built.
excluded_types = ['Other', 'Religious', 'Transportation']

grid_dataset = []
# The loop variables are deliberately distinct: the original reused `row` for
# both the grid row being iterated and the DataFrame rows inside the loop.
for i, grid_cells in enumerate(grid):
    for j, point in enumerate(grid_cells):
        # NOTE(review): 710 is presumably the search radius in meters (about
        # half the diagonal of a 1 km cell) — confirm against fetch_osm_data.
        result_df = get_osm_data(point[0], point[1], 710)
        labelled_df = result_df[~result_df['Location Type'].isin(excluded_types)]

        # Keep the first occurrence of every (name, type) pair, in order.
        labelled_df = (
            labelled_df[['Location Name', 'Location Type']]
            .drop_duplicates()
            .reset_index(drop=True)
        )

        # "<name> is a <type>; " for every place; the trailing separator is
        # kept so the CSV content matches what the loop-based version wrote.
        row_of_dataset = ''.join(
            loc_name + ' is a ' + loc_type + '; '
            for loc_name, loc_type in zip(
                labelled_df['Location Name'], labelled_df['Location Type']
            )
        )

        ## replacing any comma in the text with a blank space
        row_of_dataset = row_of_dataset.replace(',', ' ')

        grid_row = {"row": i, "col": j, "latitude": point[0], "longitude": point[1], "Map Data": row_of_dataset}
        grid_dataset.append(grid_row)

grid_df = pd.DataFrame(grid_dataset)
grid_df.to_csv('MMR_DATASET.csv', index=False)
|
759 |
+
]
|
760 |
+
},
|
761 |
+
{
|
762 |
+
"cell_type": "code",
|
763 |
+
"execution_count": 107,
|
764 |
+
"metadata": {},
|
765 |
+
"outputs": [
|
766 |
+
{
|
767 |
+
"data": {
|
768 |
+
"text/html": [
|
769 |
+
"<div>\n",
|
770 |
+
"<style scoped>\n",
|
771 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
772 |
+
" vertical-align: middle;\n",
|
773 |
+
" }\n",
|
774 |
+
"\n",
|
775 |
+
" .dataframe tbody tr th {\n",
|
776 |
+
" vertical-align: top;\n",
|
777 |
+
" }\n",
|
778 |
+
"\n",
|
779 |
+
" .dataframe thead th {\n",
|
780 |
+
" text-align: right;\n",
|
781 |
+
" }\n",
|
782 |
+
"</style>\n",
|
783 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
784 |
+
" <thead>\n",
|
785 |
+
" <tr style=\"text-align: right;\">\n",
|
786 |
+
" <th></th>\n",
|
787 |
+
" <th>row</th>\n",
|
788 |
+
" <th>col</th>\n",
|
789 |
+
" <th>latitude</th>\n",
|
790 |
+
" <th>longitude</th>\n",
|
791 |
+
" <th>Map Data</th>\n",
|
792 |
+
" </tr>\n",
|
793 |
+
" </thead>\n",
|
794 |
+
" <tbody>\n",
|
795 |
+
" <tr>\n",
|
796 |
+
" <th>0</th>\n",
|
797 |
+
" <td>0</td>\n",
|
798 |
+
" <td>0</td>\n",
|
799 |
+
" <td>18.894330</td>\n",
|
800 |
+
" <td>72.784597</td>\n",
|
801 |
+
" <td></td>\n",
|
802 |
+
" </tr>\n",
|
803 |
+
" <tr>\n",
|
804 |
+
" <th>1</th>\n",
|
805 |
+
" <td>0</td>\n",
|
806 |
+
" <td>1</td>\n",
|
807 |
+
" <td>18.894330</td>\n",
|
808 |
+
" <td>72.794102</td>\n",
|
809 |
+
" <td>Prongs Reef is a Natural,</td>\n",
|
810 |
+
" </tr>\n",
|
811 |
+
" <tr>\n",
|
812 |
+
" <th>2</th>\n",
|
813 |
+
" <td>0</td>\n",
|
814 |
+
" <td>2</td>\n",
|
815 |
+
" <td>18.894330</td>\n",
|
816 |
+
" <td>72.803607</td>\n",
|
817 |
+
" <td>United Services Club Golf Course is a Leisure ...</td>\n",
|
818 |
+
" </tr>\n",
|
819 |
+
" <tr>\n",
|
820 |
+
" <th>3</th>\n",
|
821 |
+
" <td>0</td>\n",
|
822 |
+
" <td>3</td>\n",
|
823 |
+
" <td>18.894330</td>\n",
|
824 |
+
" <td>72.813112</td>\n",
|
825 |
+
" <td>Indian Meterological Department is a Commercia...</td>\n",
|
826 |
+
" </tr>\n",
|
827 |
+
" <tr>\n",
|
828 |
+
" <th>4</th>\n",
|
829 |
+
" <td>1</td>\n",
|
830 |
+
" <td>0</td>\n",
|
831 |
+
" <td>18.903323</td>\n",
|
832 |
+
" <td>72.784597</td>\n",
|
833 |
+
" <td></td>\n",
|
834 |
+
" </tr>\n",
|
835 |
+
" <tr>\n",
|
836 |
+
" <th>5</th>\n",
|
837 |
+
" <td>1</td>\n",
|
838 |
+
" <td>1</td>\n",
|
839 |
+
" <td>18.903323</td>\n",
|
840 |
+
" <td>72.794102</td>\n",
|
841 |
+
" <td></td>\n",
|
842 |
+
" </tr>\n",
|
843 |
+
" <tr>\n",
|
844 |
+
" <th>6</th>\n",
|
845 |
+
" <td>1</td>\n",
|
846 |
+
" <td>2</td>\n",
|
847 |
+
" <td>18.903323</td>\n",
|
848 |
+
" <td>72.803607</td>\n",
|
849 |
+
" <td>Jagadish Canteen is a Food & Drink, Maratha St...</td>\n",
|
850 |
+
" </tr>\n",
|
851 |
+
" <tr>\n",
|
852 |
+
" <th>7</th>\n",
|
853 |
+
" <td>1</td>\n",
|
854 |
+
" <td>3</td>\n",
|
855 |
+
" <td>18.903323</td>\n",
|
856 |
+
" <td>72.813112</td>\n",
|
857 |
+
" <td>Indian Meterological Department is a Commercia...</td>\n",
|
858 |
+
" </tr>\n",
|
859 |
+
" <tr>\n",
|
860 |
+
" <th>8</th>\n",
|
861 |
+
" <td>2</td>\n",
|
862 |
+
" <td>0</td>\n",
|
863 |
+
" <td>18.912316</td>\n",
|
864 |
+
" <td>72.784597</td>\n",
|
865 |
+
" <td></td>\n",
|
866 |
+
" </tr>\n",
|
867 |
+
" <tr>\n",
|
868 |
+
" <th>9</th>\n",
|
869 |
+
" <td>2</td>\n",
|
870 |
+
" <td>1</td>\n",
|
871 |
+
" <td>18.912316</td>\n",
|
872 |
+
" <td>72.794102</td>\n",
|
873 |
+
" <td></td>\n",
|
874 |
+
" </tr>\n",
|
875 |
+
" <tr>\n",
|
876 |
+
" <th>10</th>\n",
|
877 |
+
" <td>2</td>\n",
|
878 |
+
" <td>2</td>\n",
|
879 |
+
" <td>18.912316</td>\n",
|
880 |
+
" <td>72.803607</td>\n",
|
881 |
+
" <td>Jagadish Canteen is a Food & Drink, Maratha St...</td>\n",
|
882 |
+
" </tr>\n",
|
883 |
+
" <tr>\n",
|
884 |
+
" <th>11</th>\n",
|
885 |
+
" <td>2</td>\n",
|
886 |
+
" <td>3</td>\n",
|
887 |
+
" <td>18.912316</td>\n",
|
888 |
+
" <td>72.813112</td>\n",
|
889 |
+
" <td>Cafe Coffee Day is a Food & Drink, King Plaza ...</td>\n",
|
890 |
+
" </tr>\n",
|
891 |
+
" <tr>\n",
|
892 |
+
" <th>12</th>\n",
|
893 |
+
" <td>3</td>\n",
|
894 |
+
" <td>0</td>\n",
|
895 |
+
" <td>18.921309</td>\n",
|
896 |
+
" <td>72.784597</td>\n",
|
897 |
+
" <td></td>\n",
|
898 |
+
" </tr>\n",
|
899 |
+
" <tr>\n",
|
900 |
+
" <th>13</th>\n",
|
901 |
+
" <td>3</td>\n",
|
902 |
+
" <td>1</td>\n",
|
903 |
+
" <td>18.921309</td>\n",
|
904 |
+
" <td>72.794102</td>\n",
|
905 |
+
" <td></td>\n",
|
906 |
+
" </tr>\n",
|
907 |
+
" <tr>\n",
|
908 |
+
" <th>14</th>\n",
|
909 |
+
" <td>3</td>\n",
|
910 |
+
" <td>2</td>\n",
|
911 |
+
" <td>18.921309</td>\n",
|
912 |
+
" <td>72.803607</td>\n",
|
913 |
+
" <td></td>\n",
|
914 |
+
" </tr>\n",
|
915 |
+
" <tr>\n",
|
916 |
+
" <th>15</th>\n",
|
917 |
+
" <td>3</td>\n",
|
918 |
+
" <td>3</td>\n",
|
919 |
+
" <td>18.921309</td>\n",
|
920 |
+
" <td>72.813112</td>\n",
|
921 |
+
" <td>Cafe Coffee Day is a Food & Drink, King Plaza ...</td>\n",
|
922 |
+
" </tr>\n",
|
923 |
+
" </tbody>\n",
|
924 |
+
"</table>\n",
|
925 |
+
"</div>"
|
926 |
+
],
|
927 |
+
"text/plain": [
|
928 |
+
" row col latitude longitude \\\n",
|
929 |
+
"0 0 0 18.894330 72.784597 \n",
|
930 |
+
"1 0 1 18.894330 72.794102 \n",
|
931 |
+
"2 0 2 18.894330 72.803607 \n",
|
932 |
+
"3 0 3 18.894330 72.813112 \n",
|
933 |
+
"4 1 0 18.903323 72.784597 \n",
|
934 |
+
"5 1 1 18.903323 72.794102 \n",
|
935 |
+
"6 1 2 18.903323 72.803607 \n",
|
936 |
+
"7 1 3 18.903323 72.813112 \n",
|
937 |
+
"8 2 0 18.912316 72.784597 \n",
|
938 |
+
"9 2 1 18.912316 72.794102 \n",
|
939 |
+
"10 2 2 18.912316 72.803607 \n",
|
940 |
+
"11 2 3 18.912316 72.813112 \n",
|
941 |
+
"12 3 0 18.921309 72.784597 \n",
|
942 |
+
"13 3 1 18.921309 72.794102 \n",
|
943 |
+
"14 3 2 18.921309 72.803607 \n",
|
944 |
+
"15 3 3 18.921309 72.813112 \n",
|
945 |
+
"\n",
|
946 |
+
" Map Data \n",
|
947 |
+
"0 \n",
|
948 |
+
"1 Prongs Reef is a Natural, \n",
|
949 |
+
"2 United Services Club Golf Course is a Leisure ... \n",
|
950 |
+
"3 Indian Meterological Department is a Commercia... \n",
|
951 |
+
"4 \n",
|
952 |
+
"5 \n",
|
953 |
+
"6 Jagadish Canteen is a Food & Drink, Maratha St... \n",
|
954 |
+
"7 Indian Meterological Department is a Commercia... \n",
|
955 |
+
"8 \n",
|
956 |
+
"9 \n",
|
957 |
+
"10 Jagadish Canteen is a Food & Drink, Maratha St... \n",
|
958 |
+
"11 Cafe Coffee Day is a Food & Drink, King Plaza ... \n",
|
959 |
+
"12 \n",
|
960 |
+
"13 \n",
|
961 |
+
"14 \n",
|
962 |
+
"15 Cafe Coffee Day is a Food & Drink, King Plaza ... "
|
963 |
+
]
|
964 |
+
},
|
965 |
+
"execution_count": 107,
|
966 |
+
"metadata": {},
|
967 |
+
"output_type": "execute_result"
|
968 |
+
}
|
969 |
+
],
|
970 |
+
"source": [
|
971 |
+
"grid_df.head(20)"
|
972 |
+
]
|
973 |
+
}
|
974 |
+
],
|
975 |
+
"metadata": {
|
976 |
+
"kernelspec": {
|
977 |
+
"display_name": "Python 3",
|
978 |
+
"language": "python",
|
979 |
+
"name": "python3"
|
980 |
+
},
|
981 |
+
"language_info": {
|
982 |
+
"codemirror_mode": {
|
983 |
+
"name": "ipython",
|
984 |
+
"version": 3
|
985 |
+
},
|
986 |
+
"file_extension": ".py",
|
987 |
+
"mimetype": "text/x-python",
|
988 |
+
"name": "python",
|
989 |
+
"nbconvert_exporter": "python",
|
990 |
+
"pygments_lexer": "ipython3",
|
991 |
+
"version": "3.12.0"
|
992 |
+
}
|
993 |
+
},
|
994 |
+
"nbformat": 4,
|
995 |
+
"nbformat_minor": 2
|
996 |
+
}
|
src/main.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
######################################## IMPORTING REQUIRED LIBRARIES ####################################
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
5 |
+
data_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
|
6 |
+
from utilities import get_data, input_filter, clean_data
|
7 |
+
|
8 |
+
|
9 |
+
################################################## INPUTS ################################################
|
10 |
+
|
11 |
+
# Bottom-left (south-west) corner of the area to source, in decimal degrees.
left_lat, left_lon = 18.889833, 72.779844

# Side length of the square area; get_data() scales this by 1000 before
# handing it to the metre-based distance helper, i.e. it is in kilometres.
dist = 35
|
14 |
+
|
15 |
+
def data_sourcing():
    """Fetch the raw grid dataset and persist it as ``MMR_DATA.csv``.

    Uses the module-level ``left_lat`` / ``left_lon`` / ``dist`` inputs and
    writes the result into ``data_folder``.

    Returns:
        The DataFrame produced by ``get_data``.
    """
    origin = input_filter(lat=left_lat, lon=left_lon)
    origin_lat, origin_lon = origin

    raw_df = get_data(origin_lat, origin_lon, dist)
    raw_df.to_csv(f'{data_folder}/MMR_DATA.csv', index=False)
    return raw_df
|
20 |
+
|
21 |
+
def data_clean_for_training(df):
    """Clean the sourced dataset and persist it as ``MMR_DATA_CLEAN.csv``.

    Args:
        df: DataFrame as returned by ``data_sourcing``.

    Returns:
        The DataFrame returned by ``clean_data``.
    """
    cleaned_df = clean_data(df)
    cleaned_df.to_csv(f'{data_folder}/MMR_DATA_CLEAN.csv', index=False)
    return cleaned_df
|
25 |
+
|
26 |
+
|
27 |
+
if __name__ == '__main__':

    df = data_sourcing()  ## testing the data sourcing endpoint
    # A DataFrame has no single truth value: ``if df:`` raises
    # "ValueError: The truth value of a DataFrame is ambiguous".
    # Check explicitly for a non-empty result instead.
    if df is not None and not df.empty:
        print("Data loaded successfully !!")

    clean_df = data_clean_for_training(df)
|
utilities/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .data_loader import get_data, input_filter
|
2 |
+
from .data_cleaner import clean_data
|
utilities/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (277 Bytes). View file
|
|
utilities/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (251 Bytes). View file
|
|
utilities/__pycache__/data_loader.cpython-311.pyc
ADDED
Binary file (10.1 kB). View file
|
|
utilities/__pycache__/data_loader.cpython-312.pyc
ADDED
Binary file (8.92 kB). View file
|
|
utilities/data_cleaner.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import nltk
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from nltk.stem import PorterStemmer
|
5 |
+
from nltk.stem import WordNetLemmatizer
|
6 |
+
|
7 |
+
def clean_text(text):
    """Normalise free text for vectorisation.

    Steps, in order: strip punctuation, lowercase, drop English stop words,
    Porter-stem each remaining word, then WordNet-lemmatize the stem.

    The NLTK corpora are downloaded and the stop-word set, stemmer and
    lemmatizer are built only on the first call; later calls reuse them.
    This matters when the function is applied to every row of a DataFrame.

    Args:
        text: The string to clean.

    Returns:
        The cleaned words joined by single spaces.
    """
    resources = getattr(clean_text, '_resources', None)
    if resources is None:
        nltk.download('stopwords')
        nltk.download('wordnet')
        resources = (
            set(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
        # Cache on the function object so the setup cost is paid once.
        clean_text._resources = resources
    stop_words, stemmer, lemmatizer = resources

    text = re.sub(r'[^\w\s]', '', text).lower()
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in words)
|
21 |
+
|
22 |
+
def clean_data(df):
    """Drop grid cells whose 'Map Data' text is missing, empty or too long.

    Rows are kept when the text length is strictly between 0 and 5000
    characters; missing values count as empty. The input DataFrame is left
    untouched and a filtered copy is returned.

    Args:
        df: DataFrame with a 'Map Data' text column.

    Returns:
        A new DataFrame containing only the rows that pass the length filter.
    """
    map_text = df['Map Data'].fillna('')
    lengths = map_text.str.len()
    keep = (lengths > 0) & (lengths < 5000)

    # Copy so that neither the caller's frame nor a view of it is modified.
    cleaned = df.loc[keep].copy()
    cleaned['Map Data'] = map_text.loc[keep]
    # Optional text normalisation step, currently disabled:
    # cleaned['Map Data'] = cleaned['Map Data'].apply(clean_text)
    return cleaned
|
utilities/data_loader.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
import math
|
5 |
+
from typing import Tuple, List, Dict
|
6 |
+
|
7 |
+
def fetch_osm_data(lat: float, lon: float, radius: int) -> List[Dict]:
    """Query the Overpass API for named OSM elements around a point.

    Args:
        lat: Latitude of the search centre, in decimal degrees.
        lon: Longitude of the search centre, in decimal degrees.
        radius: Search radius passed to the Overpass ``around`` filter.

    Returns:
        The list under the response's ``elements`` key (nodes, ways and
        relations that carry a ``name`` tag), or an empty list when the
        response has no such key.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If connecting or reading takes too long.
    """
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    (
      node["name"](around:{radius},{lat},{lon});
      way["name"](around:{radius},{lat},{lon});
      relation["name"](around:{radius},{lat},{lon});
    );
    out center;
    """

    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(
        overpass_url,
        params={'data': overpass_query},
        timeout=(10, 180),
    )
    # Surface HTTP errors (rate limiting, overload) as such, rather than as a
    # confusing JSON decode failure on an HTML error page.
    response.raise_for_status()
    return response.json().get('elements', [])
|
22 |
+
|
23 |
+
def determine_location_type(tags: Dict[str, str]) -> str:
    """Map an OSM tag dictionary to a coarse location category.

    Rules are evaluated top to bottom and the first match wins, so the
    order below is significant. The specific 'Parks & Recreation' rule is
    checked before the generic leisure/tourism rule; otherwise any element
    carrying a ``leisure`` tag would be swallowed by the generic rule and
    the parks category could never be returned.

    Args:
        tags: The element's OSM tags (key -> value).

    Returns:
        One of the category names below, ``'Landuse: <Value>'`` for an
        unrecognised ``landuse`` value, or ``'Other'`` when nothing matches.
    """
    building = tags.get('building')
    amenity = tags.get('amenity')
    leisure = tags.get('leisure')

    # Residential
    if building in ('residential', 'house', 'apartments', 'detached', 'terrace', 'dormitory', 'bungalow'):
        return 'Residential'

    # Commercial
    if any(key in tags for key in ('shop', 'office', 'craft')):
        return 'Commercial'
    if building in ('commercial', 'office', 'retail', 'supermarket', 'kiosk'):
        return 'Commercial'

    # Industrial
    if building in ('industrial', 'warehouse', 'factory', 'manufacture'):
        return 'Industrial'
    if 'industrial' in tags or 'industry' in tags:
        return 'Industrial'

    # Educational
    if amenity in ('school', 'university', 'college', 'library', 'kindergarten', 'language_school'):
        return 'Educational'

    # Healthcare
    if amenity in ('hospital', 'clinic', 'doctors', 'dentist', 'pharmacy', 'veterinary'):
        return 'Healthcare'

    # Food & Drink
    if amenity in ('restaurant', 'cafe', 'bar', 'fast_food', 'pub', 'food_court'):
        return 'Food & Drink'

    # Parks & Recreation (must precede the generic leisure rule below)
    if leisure in ('park', 'playground', 'sports_centre', 'stadium', 'garden'):
        return 'Parks & Recreation'

    # Leisure & Entertainment
    if 'leisure' in tags or 'tourism' in tags:
        return 'Leisure & Entertainment'
    if amenity in ('theatre', 'cinema', 'nightclub', 'arts_centre', 'community_centre'):
        return 'Leisure & Entertainment'

    # Transportation
    if amenity in ('parking', 'bicycle_parking', 'bus_station', 'ferry_terminal'):
        return 'Transportation'
    if 'highway' in tags or 'railway' in tags or 'aeroway' in tags:
        return 'Transportation'

    # Religious
    if amenity in ('place_of_worship', 'monastery'):
        return 'Religious'

    # Government & Public Services
    if amenity in ('townhall', 'courthouse', 'police', 'fire_station', 'post_office'):
        return 'Government & Public Services'

    # Natural
    if 'natural' in tags:
        return 'Natural'

    # Landuse
    if 'landuse' in tags:
        landuse = tags['landuse'].capitalize()
        if landuse in ('Residential', 'Commercial', 'Industrial', 'Retail'):
            return landuse
        return f'Landuse: {landuse}'

    # If no specific category is found, return 'Other'
    return 'Other'
|
90 |
+
|
91 |
+
def parse_osm_data(elements: List[Dict]) -> pd.DataFrame:
    """Flatten raw Overpass elements into a three-column DataFrame.

    Each element becomes one row with:
      * 'ID'            -- ``"<type>_<id>"`` of the element,
      * 'Location Name' -- its ``name`` tag (empty string if absent),
      * 'Location Type' -- the category from ``determine_location_type``.

    An empty input yields an empty DataFrame that still has these columns.
    """
    column_names = ['ID', 'Location Name', 'Location Type']

    records = []
    for item in elements:
        item_tags = item.get('tags', {})
        values = (
            f"{item['type']}_{item['id']}",
            item_tags.get('name', ''),
            determine_location_type(item_tags),
        )
        records.append(dict(zip(column_names, values)))

    if not records:
        return pd.DataFrame(columns=column_names)
    return pd.DataFrame(records)
|
104 |
+
|
105 |
+
def get_osm_data(lat: float, lon: float, radius: int) -> pd.DataFrame:
    """Fetch named OSM elements around a point and return them as a table.

    Thin composition of ``fetch_osm_data`` (network query) and
    ``parse_osm_data`` (flattening into a DataFrame).
    """
    return parse_osm_data(fetch_osm_data(lat, lon, radius))
|
108 |
+
|
109 |
+
def dms_to_decimal(coord_str):
    """Convert a degrees/minutes/seconds coordinate pair to decimal degrees.

    Args:
        coord_str: Text such as ``19°03'08.6"N 72°54'06.0"E`` (latitude
            first, then longitude).

    Returns:
        ``(lat, lon)`` in decimal degrees; southern latitudes and western
        longitudes are negative.

    Raises:
        ValueError: If ``coord_str`` does not start with the expected format.
    """
    dms_pattern = r'(\d+)°(\d+)\'([\d.]+)"([NS])\s*(\d+)°(\d+)\'([\d.]+)"([EW])'

    parsed = re.match(dms_pattern, coord_str)
    if parsed is None:
        raise ValueError("Invalid coordinate format. Expected format: 19°03'08.6\"N 72°54'06.0\"E")

    def to_degrees(degrees, minutes, seconds, hemisphere, negative_hemisphere):
        # degrees + minutes/60 + seconds/3600, negated for S / W.
        value = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
        return -value if hemisphere == negative_hemisphere else value

    fields = parsed.groups()
    return to_degrees(*fields[:4], 'S'), to_degrees(*fields[4:], 'W')
|
130 |
+
|
131 |
+
|
132 |
+
def calculate_distant_points(lat: float, lon: float, distance: float) -> tuple:
    """Return the points ``distance`` metres east and north of a location.

    Uses a spherical Earth of radius 6,371,000 m. The eastward offset is
    scaled by the cosine of the latitude.

    Args:
        lat: Latitude of the starting point, in decimal degrees.
        lon: Longitude of the starting point, in decimal degrees.
        distance: Offset in metres.

    Returns:
        ``((lat, lon_east), (lat_north, lon))``: first the point at the same
        latitude shifted in longitude, then the point at the same longitude
        shifted in latitude.
    """
    earth_radius_m = 6371000

    # Radius of the circle of latitude the starting point lies on.
    parallel_radius = earth_radius_m * math.cos(math.radians(lat))

    east_point = (lat, lon + math.degrees(distance / parallel_radius))
    north_point = (lat + math.degrees(distance / earth_radius_m), lon)
    return (east_point, north_point)
|
149 |
+
|
150 |
+
## 2d map grid (0,0) --> bottom left
|
151 |
+
|
152 |
+
def create_map_grid(bottom_left: Tuple[float, float], top_right: Tuple[float, float], rows: int, cols: int) -> List[List[Tuple[float, float]]]:
    """Split a bounding box into ``rows`` x ``cols`` cells and return their centres.

    Args:
        bottom_left: ``(lat, lon)`` of the box's bottom-left corner.
        top_right: ``(lat, lon)`` of the box's top-right corner.
        rows: Number of cells along the latitude axis.
        cols: Number of cells along the longitude axis.

    Returns:
        A list of ``rows`` lists, each holding ``cols`` ``(lat, lon)`` cell
        centres. Index ``[0][0]`` is the cell touching ``bottom_left``.
    """
    lat_step = (top_right[0] - bottom_left[0]) / rows
    lon_step = (top_right[1] - bottom_left[1]) / cols

    # Cell origin plus half a step gives the cell centre.
    return [
        [
            (
                bottom_left[0] + r * lat_step + lat_step / 2,
                bottom_left[1] + c * lon_step + lon_step / 2,
            )
            for c in range(cols)
        ]
        for r in range(rows)
    ]
|
168 |
+
|
169 |
+
## entire pipeline
|
170 |
+
|
171 |
+
# Default bottom-left corner (decimal degrees) and extent of the area;
# the same values are declared as inputs in src/main.py.
left_lat, left_lon = 18.889833, 72.779844
dist = 35
|
174 |
+
|
175 |
+
def input_filter(lat=None, lon=None, string=None):
    """Normalise the supported coordinate inputs to a ``(lat, lon)`` pair.

    Args:
        lat: Latitude in decimal degrees. When given, it takes precedence
            and ``(lat, lon)`` is returned as-is (``lon`` is not validated).
        lon: Longitude in decimal degrees, used together with ``lat``.
        string: Degrees/minutes/seconds text, parsed with ``dms_to_decimal``
            when ``lat`` is not given.

    Returns:
        ``(lat, lon)``, or ``None`` when neither ``lat`` nor ``string`` is
        supplied.

    Raises:
        ValueError: If ``string`` is given but is not in the expected format.
    """
    # Identity checks: 0.0 is a valid coordinate, and ``!= None`` can
    # misbehave for objects that overload comparison.
    if lat is not None:
        return (lat, lon)
    if string is not None:
        return tuple(dms_to_decimal(string))
    return None
|
183 |
+
|
184 |
+
def get_data(bottom_left_lat, bottom_left_lon, dist):
    """Build the grid dataset of textual place descriptions.

    The square area starting at ``(bottom_left_lat, bottom_left_lon)`` and
    extending ``dist`` km north and east is split into ``dist`` x ``dist``
    cells. For every cell centre, named OSM elements within 710 m are
    fetched, filtered and summarised as ``"<name> is a <type>; "`` fragments.

    Args:
        bottom_left_lat: Latitude of the area's bottom-left corner (degrees).
        bottom_left_lon: Longitude of the area's bottom-left corner (degrees).
        dist: Side length of the area in kilometres; also the number of grid
            rows and columns.

    Returns:
        A DataFrame with one row per cell and the columns ``row``, ``col``,
        ``latitude``, ``longitude`` and ``Map Data``.
    """
    east_point, north_point = calculate_distant_points(bottom_left_lat, bottom_left_lon, 1000 * dist)
    top_right = (north_point[0], east_point[1])

    # The grid must start at the corner passed in, not at module-level
    # defaults, or it will not match the top-right corner computed above.
    grid = create_map_grid((bottom_left_lat, bottom_left_lon), top_right, dist, dist)

    # Categories that are left out of the textual description.
    excluded_types = ['Other', 'Religious', 'Transportation']

    grid_dataset = []
    for i, grid_row in enumerate(grid):
        for j, (cell_lat, cell_lon) in enumerate(grid_row):
            # 710 m is just over half the diagonal (~707 m) of a 1 km cell,
            # so the search circle covers the whole cell.
            result_df = get_osm_data(cell_lat, cell_lon, 710)
            labelled_df = result_df[~result_df['Location Type'].isin(excluded_types)]

            # Ordered de-duplication of (name, type) pairs.
            unique_places = dict.fromkeys(
                zip(labelled_df['Location Name'], labelled_df['Location Type'])
            )

            map_text = ''.join(
                f'{name} is a {loc_type}; ' for name, loc_type in unique_places
            )
            # Replace any comma in the text with a blank space so the value
            # stays CSV-friendly.
            map_text = map_text.replace(',', ' ')

            grid_dataset.append({
                "row": i,
                "col": j,
                "latitude": cell_lat,
                "longitude": cell_lon,
                "Map Data": map_text,
            })

    return pd.DataFrame(grid_dataset)
|
222 |
+
# grid_df.to_csv('MMR_DATASET.csv', index=False)
|
utils/__init__.py
DELETED
File without changes
|