akhil-vaidya committed on
Commit
c161b3b
·
1 Parent(s): d217164

feat: data-pipelines

Browse files

implemented data loading pipelines, data cleaning and basic EDA along with TFIDF vectorization

data/MMR_DATA.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/__init__.py DELETED
File without changes
notebooks/EDA.ipynb ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## __Exploratory Data Analysis__"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 53,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "## importing libraries\n",
17
+ "\n",
18
+ "import numpy as numpy\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "\n",
23
+ "import sys\n",
24
+ "import os\n",
25
+ "\n",
26
+ "import re\n",
27
+ "import nltk\n",
28
+ "from nltk.corpus import stopwords\n",
29
+ "from nltk.stem import PorterStemmer\n",
30
+ "from nltk.stem import WordNetLemmatizer"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 6,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "data": {
40
+ "text/html": [
41
+ "<div>\n",
42
+ "<style scoped>\n",
43
+ " .dataframe tbody tr th:only-of-type {\n",
44
+ " vertical-align: middle;\n",
45
+ " }\n",
46
+ "\n",
47
+ " .dataframe tbody tr th {\n",
48
+ " vertical-align: top;\n",
49
+ " }\n",
50
+ "\n",
51
+ " .dataframe thead th {\n",
52
+ " text-align: right;\n",
53
+ " }\n",
54
+ "</style>\n",
55
+ "<table border=\"1\" class=\"dataframe\">\n",
56
+ " <thead>\n",
57
+ " <tr style=\"text-align: right;\">\n",
58
+ " <th></th>\n",
59
+ " <th>row</th>\n",
60
+ " <th>col</th>\n",
61
+ " <th>latitude</th>\n",
62
+ " <th>longitude</th>\n",
63
+ " <th>Map Data</th>\n",
64
+ " </tr>\n",
65
+ " </thead>\n",
66
+ " <tbody>\n",
67
+ " <tr>\n",
68
+ " <th>0</th>\n",
69
+ " <td>0</td>\n",
70
+ " <td>0</td>\n",
71
+ " <td>18.89433</td>\n",
72
+ " <td>72.784597</td>\n",
73
+ " <td>NaN</td>\n",
74
+ " </tr>\n",
75
+ " <tr>\n",
76
+ " <th>1</th>\n",
77
+ " <td>0</td>\n",
78
+ " <td>1</td>\n",
79
+ " <td>18.89433</td>\n",
80
+ " <td>72.794102</td>\n",
81
+ " <td>Prongs Reef is a Natural;</td>\n",
82
+ " </tr>\n",
83
+ " <tr>\n",
84
+ " <th>2</th>\n",
85
+ " <td>0</td>\n",
86
+ " <td>2</td>\n",
87
+ " <td>18.89433</td>\n",
88
+ " <td>72.803607</td>\n",
89
+ " <td>United Services Club Golf Course is a Leisure ...</td>\n",
90
+ " </tr>\n",
91
+ " <tr>\n",
92
+ " <th>3</th>\n",
93
+ " <td>0</td>\n",
94
+ " <td>3</td>\n",
95
+ " <td>18.89433</td>\n",
96
+ " <td>72.813112</td>\n",
97
+ " <td>Indian Meterological Department is a Commercia...</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>4</th>\n",
101
+ " <td>0</td>\n",
102
+ " <td>4</td>\n",
103
+ " <td>18.89433</td>\n",
104
+ " <td>72.822617</td>\n",
105
+ " <td>NaN</td>\n",
106
+ " </tr>\n",
107
+ " </tbody>\n",
108
+ "</table>\n",
109
+ "</div>"
110
+ ],
111
+ "text/plain": [
112
+ " row col latitude longitude \\\n",
113
+ "0 0 0 18.89433 72.784597 \n",
114
+ "1 0 1 18.89433 72.794102 \n",
115
+ "2 0 2 18.89433 72.803607 \n",
116
+ "3 0 3 18.89433 72.813112 \n",
117
+ "4 0 4 18.89433 72.822617 \n",
118
+ "\n",
119
+ " Map Data \n",
120
+ "0 NaN \n",
121
+ "1 Prongs Reef is a Natural; \n",
122
+ "2 United Services Club Golf Course is a Leisure ... \n",
123
+ "3 Indian Meterological Department is a Commercia... \n",
124
+ "4 NaN "
125
+ ]
126
+ },
127
+ "execution_count": 6,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "data_folder = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
134
+ "data_file = os.path.join(data_folder, 'MMR_DATA.csv')\n",
135
+ "df = pd.read_csv(data_file)\n",
136
+ "df.head()"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 7,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "data": {
146
+ "text/html": [
147
+ "<div>\n",
148
+ "<style scoped>\n",
149
+ " .dataframe tbody tr th:only-of-type {\n",
150
+ " vertical-align: middle;\n",
151
+ " }\n",
152
+ "\n",
153
+ " .dataframe tbody tr th {\n",
154
+ " vertical-align: top;\n",
155
+ " }\n",
156
+ "\n",
157
+ " .dataframe thead th {\n",
158
+ " text-align: right;\n",
159
+ " }\n",
160
+ "</style>\n",
161
+ "<table border=\"1\" class=\"dataframe\">\n",
162
+ " <thead>\n",
163
+ " <tr style=\"text-align: right;\">\n",
164
+ " <th></th>\n",
165
+ " <th>row</th>\n",
166
+ " <th>col</th>\n",
167
+ " <th>latitude</th>\n",
168
+ " <th>longitude</th>\n",
169
+ " <th>Map Data</th>\n",
170
+ " </tr>\n",
171
+ " </thead>\n",
172
+ " <tbody>\n",
173
+ " <tr>\n",
174
+ " <th>0</th>\n",
175
+ " <td>0</td>\n",
176
+ " <td>0</td>\n",
177
+ " <td>18.89433</td>\n",
178
+ " <td>72.784597</td>\n",
179
+ " <td></td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>1</th>\n",
183
+ " <td>0</td>\n",
184
+ " <td>1</td>\n",
185
+ " <td>18.89433</td>\n",
186
+ " <td>72.794102</td>\n",
187
+ " <td>Prongs Reef is a Natural;</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>2</th>\n",
191
+ " <td>0</td>\n",
192
+ " <td>2</td>\n",
193
+ " <td>18.89433</td>\n",
194
+ " <td>72.803607</td>\n",
195
+ " <td>United Services Club Golf Course is a Leisure ...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>3</th>\n",
199
+ " <td>0</td>\n",
200
+ " <td>3</td>\n",
201
+ " <td>18.89433</td>\n",
202
+ " <td>72.813112</td>\n",
203
+ " <td>Indian Meterological Department is a Commercia...</td>\n",
204
+ " </tr>\n",
205
+ " <tr>\n",
206
+ " <th>4</th>\n",
207
+ " <td>0</td>\n",
208
+ " <td>4</td>\n",
209
+ " <td>18.89433</td>\n",
210
+ " <td>72.822617</td>\n",
211
+ " <td></td>\n",
212
+ " </tr>\n",
213
+ " </tbody>\n",
214
+ "</table>\n",
215
+ "</div>"
216
+ ],
217
+ "text/plain": [
218
+ " row col latitude longitude \\\n",
219
+ "0 0 0 18.89433 72.784597 \n",
220
+ "1 0 1 18.89433 72.794102 \n",
221
+ "2 0 2 18.89433 72.803607 \n",
222
+ "3 0 3 18.89433 72.813112 \n",
223
+ "4 0 4 18.89433 72.822617 \n",
224
+ "\n",
225
+ " Map Data \n",
226
+ "0 \n",
227
+ "1 Prongs Reef is a Natural; \n",
228
+ "2 United Services Club Golf Course is a Leisure ... \n",
229
+ "3 Indian Meterological Department is a Commercia... \n",
230
+ "4 "
231
+ ]
232
+ },
233
+ "execution_count": 7,
234
+ "metadata": {},
235
+ "output_type": "execute_result"
236
+ }
237
+ ],
238
+ "source": [
239
+ "## filling the NaN values in the Map Data Column with empty string\n",
240
+ "\n",
241
+ "df['Map Data'] = df['Map Data'].fillna('')\n",
242
+ "df.head()"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 20,
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "data": {
252
+ "text/plain": [
253
+ "1225"
254
+ ]
255
+ },
256
+ "execution_count": 20,
257
+ "metadata": {},
258
+ "output_type": "execute_result"
259
+ }
260
+ ],
261
+ "source": [
262
+ "len(df)"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 42,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "df_len_explore = df.copy()"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 43,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "data": {
281
+ "text/html": [
282
+ "<div>\n",
283
+ "<style scoped>\n",
284
+ " .dataframe tbody tr th:only-of-type {\n",
285
+ " vertical-align: middle;\n",
286
+ " }\n",
287
+ "\n",
288
+ " .dataframe tbody tr th {\n",
289
+ " vertical-align: top;\n",
290
+ " }\n",
291
+ "\n",
292
+ " .dataframe thead th {\n",
293
+ " text-align: right;\n",
294
+ " }\n",
295
+ "</style>\n",
296
+ "<table border=\"1\" class=\"dataframe\">\n",
297
+ " <thead>\n",
298
+ " <tr style=\"text-align: right;\">\n",
299
+ " <th></th>\n",
300
+ " <th>Map Data</th>\n",
301
+ " </tr>\n",
302
+ " </thead>\n",
303
+ " <tbody>\n",
304
+ " <tr>\n",
305
+ " <th>0</th>\n",
306
+ " <td></td>\n",
307
+ " </tr>\n",
308
+ " <tr>\n",
309
+ " <th>1</th>\n",
310
+ " <td>Prongs Reef is a Natural;</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>2</th>\n",
314
+ " <td>United Services Club Golf Course is a Leisure ...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>3</th>\n",
318
+ " <td>Indian Meterological Department is a Commercia...</td>\n",
319
+ " </tr>\n",
320
+ " <tr>\n",
321
+ " <th>4</th>\n",
322
+ " <td></td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "</div>"
327
+ ],
328
+ "text/plain": [
329
+ " Map Data\n",
330
+ "0 \n",
331
+ "1 Prongs Reef is a Natural; \n",
332
+ "2 United Services Club Golf Course is a Leisure ...\n",
333
+ "3 Indian Meterological Department is a Commercia...\n",
334
+ "4 "
335
+ ]
336
+ },
337
+ "execution_count": 43,
338
+ "metadata": {},
339
+ "output_type": "execute_result"
340
+ }
341
+ ],
342
+ "source": [
343
+ "## dropping the columns that are not needed for the analysis\n",
344
+ "\n",
345
+ "df_len_explore = df_len_explore.drop(columns=['row', 'col', 'latitude', 'longitude'])\n",
346
+ "df_len_explore.head()"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": 44,
352
+ "metadata": {},
353
+ "outputs": [
354
+ {
355
+ "data": {
356
+ "text/plain": [
357
+ "791"
358
+ ]
359
+ },
360
+ "execution_count": 44,
361
+ "metadata": {},
362
+ "output_type": "execute_result"
363
+ }
364
+ ],
365
+ "source": [
366
+ "## dropping the rows with 0 string length and string length >= 5000\n",
367
+ "\n",
368
+ "df_len_explore = df_len_explore[df_len_explore['Map Data'].str.len() > 0]\n",
369
+ "df_len_explore = df_len_explore[df_len_explore['Map Data'].str.len() < 5000]\n",
370
+ "len(df_len_explore)"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 45,
376
+ "metadata": {},
377
+ "outputs": [
378
+ {
379
+ "name": "stdout",
380
+ "output_type": "stream",
381
+ "text": [
382
+ "Discarded rows: 434 / 1225\n"
383
+ ]
384
+ }
385
+ ],
386
+ "source": [
387
+ "print('Discarded rows: ', len(df) - len(df_len_explore), '/', len(df))"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 52,
393
+ "metadata": {},
394
+ "outputs": [
395
+ {
396
+ "name": "stdout",
397
+ "output_type": "stream",
398
+ "text": [
399
+ "Mean string length: 834.7509481668774\n",
400
+ "80th percentile string length: 1560.0\n"
401
+ ]
402
+ }
403
+ ],
404
+ "source": [
405
+ "## mean and 80th percentile of the string length\n",
406
+ "\n",
407
+ "print('Mean string length: ', df_len_explore['Map Data'].str.len().mean())\n",
408
+ "print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 51,
414
+ "metadata": {},
415
+ "outputs": [
416
+ {
417
+ "data": {
418
+ "text/plain": [
419
+ "<matplotlib.lines.Line2D at 0x1c5b27a9790>"
420
+ ]
421
+ },
422
+ "execution_count": 51,
423
+ "metadata": {},
424
+ "output_type": "execute_result"
425
+ },
426
+ {
427
+ "data": {
428
+ "image/png": "",
429
+ "text/plain": [
430
+ "<Figure size 640x480 with 1 Axes>"
431
+ ]
432
+ },
433
+ "metadata": {},
434
+ "output_type": "display_data"
435
+ }
436
+ ],
437
+ "source": [
438
+ "## plotting length of strings in the Map Data Column, mean and 80th percentile\n",
439
+ "\n",
440
+ "df_len_explore['Map Data'].str.len().plot(kind='hist', bins=100)\n",
441
+ "plt.axvline(df_len_explore['Map Data'].str.len().mean(), color='red', linestyle='dashed', linewidth=2)\n",
442
+ "plt.axvline(df_len_explore['Map Data'].str.len().quantile(0.80), color='green', linestyle='dashed', linewidth=2)"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "markdown",
447
+ "metadata": {},
448
+ "source": [
449
+ "### Same visualization, post cleaning"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": 54,
455
+ "metadata": {},
456
+ "outputs": [
457
+ {
458
+ "name": "stderr",
459
+ "output_type": "stream",
460
+ "text": [
461
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Akhil\n",
462
+ "[nltk_data] PC\\AppData\\Roaming\\nltk_data...\n",
463
+ "[nltk_data] Package stopwords is already up-to-date!\n",
464
+ "[nltk_data] Downloading package wordnet to C:\\Users\\Akhil\n",
465
+ "[nltk_data] PC\\AppData\\Roaming\\nltk_data...\n",
466
+ "[nltk_data] Package wordnet is already up-to-date!\n"
467
+ ]
468
+ }
469
+ ],
470
+ "source": [
471
+ "nltk.download('stopwords')\n",
472
+ "nltk.download('wordnet')\n",
473
+ "\n",
474
+ "stop_words = set(stopwords.words('english'))\n",
475
+ "stemmer = PorterStemmer()\n",
476
+ "lemmatizer = WordNetLemmatizer()"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": 55,
482
+ "metadata": {},
483
+ "outputs": [],
484
+ "source": [
485
+ "## cleaning the strings, stemming and lemmatizing\n",
486
+ "\n",
487
+ "def clean_text(text):\n",
488
+ " text = re.sub(r'[^\\w\\s]', '', text)\n",
489
+ " text = text.lower()\n",
490
+ " text = [word for word in text.split() if word not in stop_words]\n",
491
+ " text = [stemmer.stem(word) for word in text] \n",
492
+ " text = [lemmatizer.lemmatize(word) for word in text]\n",
493
+ " return ' '.join(text)\n",
494
+ "\n",
495
+ "df_len_explore['Map Data'] = df_len_explore['Map Data'].apply(clean_text)"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 56,
501
+ "metadata": {},
502
+ "outputs": [
503
+ {
504
+ "name": "stdout",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "Mean string length: 596.3046776232617\n",
508
+ "80th percentile string length: 1114.0\n"
509
+ ]
510
+ }
511
+ ],
512
+ "source": [
513
+ "print('Mean string length: ', df_len_explore['Map Data'].str.len().mean())\n",
514
+ "print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": 57,
520
+ "metadata": {},
521
+ "outputs": [
522
+ {
523
+ "data": {
524
+ "text/plain": [
525
+ "<matplotlib.lines.Line2D at 0x1c5b5e37b00>"
526
+ ]
527
+ },
528
+ "execution_count": 57,
529
+ "metadata": {},
530
+ "output_type": "execute_result"
531
+ },
532
+ {
533
+ "data": {
534
+ "image/png": "",
535
+ "text/plain": [
536
+ "<Figure size 640x480 with 1 Axes>"
537
+ ]
538
+ },
539
+ "metadata": {},
540
+ "output_type": "display_data"
541
+ }
542
+ ],
543
+ "source": [
544
+ "df_len_explore['Map Data'].str.len().plot(kind='hist', bins=100)\n",
545
+ "plt.axvline(df_len_explore['Map Data'].str.len().mean(), color='red', linestyle='dashed', linewidth=2)\n",
546
+ "plt.axvline(df_len_explore['Map Data'].str.len().quantile(0.80), color='green', linestyle='dashed', linewidth=2)"
547
+ ]
548
+ },
549
+ {
550
+ "cell_type": "code",
551
+ "execution_count": 59,
552
+ "metadata": {},
553
+ "outputs": [
554
+ {
555
+ "name": "stdout",
556
+ "output_type": "stream",
557
+ "text": [
558
+ "Original data length: 1225\n",
559
+ "Number of blanks: 407\n",
560
+ "Number of strings with length > 5000: 27\n",
561
+ "Number of useful rows: 791\n"
562
+ ]
563
+ }
564
+ ],
565
+ "source": [
566
+ "## Final Summary\n",
567
+ "\n",
568
+ "print('Original data length: ', len(df))\n",
569
+ "print('Number of blanks: ', len(df) - len(df[df['Map Data'].str.len() > 0]))\n",
570
+ "print('Number of strings with length > 5000: ', len(df[df['Map Data'].str.len() > 5000]))\n",
571
+ "print('Number of useful rows: ', len(df_len_explore))"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 62,
577
+ "metadata": {},
578
+ "outputs": [
579
+ {
580
+ "name": "stdout",
581
+ "output_type": "stream",
582
+ "text": [
583
+ "Post cleaning data length: 791\n",
584
+ "Avg string length: 596.3046776232617\n",
585
+ "Median string length: 192.0\n",
586
+ "25th percentile string length: 43.5\n",
587
+ "80th percentile string length: 1114.0\n"
588
+ ]
589
+ }
590
+ ],
591
+ "source": [
592
+ "print('Post cleaning data length: ', len(df_len_explore))\n",
593
+ "print('Avg string length: ', df_len_explore['Map Data'].str.len().mean())\n",
594
+ "print('Median string length: ', df_len_explore['Map Data'].str.len().median())\n",
595
+ "print('25th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.25)) \n",
596
+ "print('80th percentile string length: ', df_len_explore['Map Data'].str.len().quantile(0.8))"
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "markdown",
601
+ "metadata": {},
602
+ "source": [
603
+ "Highly uneven string length distribution.\n",
604
+ "- 33% of the total data is useless, i.e. blank rows\n",
605
+ "- 25% of the strings have length less than 45 characters\n",
606
+ "- 50% of the strings have length less than 200 characters\n",
607
+ "- 30% of the strings have length between 200 - 1100 characters (huge variation)\n",
608
+ "- 20% of the strings have length greater than 1100 characters\n",
609
+ "- about 3% of the non-blank strings (27 of 818; roughly 2% of all 1225 rows) are longer than 5000 characters (won't be used in training)"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "execution_count": null,
615
+ "metadata": {},
616
+ "outputs": [],
617
+ "source": [
618
+ "## function to clean the given data frame\n",
619
+ "\n",
620
+ "def clean_text(text):\n",
621
+ " text = re.sub(r'[^\\w\\s]', '', text)\n",
622
+ " text = text.lower()\n",
623
+ " text = [word for word in text.split() if word not in stop_words]\n",
624
+ " text = [stemmer.stem(word) for word in text] \n",
625
+ " text = [lemmatizer.lemmatize(word) for word in text]\n",
626
+ " return ' '.join(text)\n",
627
+ "\n",
628
+ "def clean_data(df):\n",
629
+ " df['Map Data'] = df['Map Data'].fillna('')\n",
630
+ " df = df[df['Map Data'].str.len() > 0]\n",
631
+ " df = df[df['Map Data'].str.len() < 5000]\n",
632
+ " df['Map Data'] = df['Map Data'].apply(clean_text)\n",
633
+ " return df"
634
+ ]
635
+ }
636
+ ],
637
+ "metadata": {
638
+ "kernelspec": {
639
+ "display_name": "Python 3",
640
+ "language": "python",
641
+ "name": "python3"
642
+ },
643
+ "language_info": {
644
+ "codemirror_mode": {
645
+ "name": "ipython",
646
+ "version": 3
647
+ },
648
+ "file_extension": ".py",
649
+ "mimetype": "text/x-python",
650
+ "name": "python",
651
+ "nbconvert_exporter": "python",
652
+ "pygments_lexer": "ipython3",
653
+ "version": "3.12.0"
654
+ }
655
+ },
656
+ "nbformat": 4,
657
+ "nbformat_minor": 2
658
+ }
notebooks/TFIDF.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/__init__.py DELETED
File without changes
notebooks/data_loading.ipynb ADDED
@@ -0,0 +1,996 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## __Data Pipelines__ \n",
8
+ "Loading data from OpenStreetMap using the Overpass API"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 60,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import requests\n",
18
+ "import pandas as pd\n",
19
+ "import re\n",
20
+ "import math\n",
21
+ "from typing import Tuple, List, Dict"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 97,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "def fetch_osm_data(lat: float, lon: float, radius: int) -> List[Dict]:\n",
31
+ " overpass_url = \"http://overpass-api.de/api/interpreter\"\n",
32
+ " overpass_query = f\"\"\"\n",
33
+ " [out:json];\n",
34
+ " (\n",
35
+ " node[\"name\"](around:{radius},{lat},{lon});\n",
36
+ " way[\"name\"](around:{radius},{lat},{lon});\n",
37
+ " relation[\"name\"](around:{radius},{lat},{lon});\n",
38
+ " );\n",
39
+ " out center;\n",
40
+ " \"\"\"\n",
41
+ " \n",
42
+ " response = requests.get(overpass_url, params={'data': overpass_query})\n",
43
+ " data = response.json()\n",
44
+ " return data['elements']\n",
45
+ "\n",
46
+ "def determine_location_type(tags: Dict[str, str]) -> str:\n",
47
+ " # Residential\n",
48
+ " if 'building' in tags and tags['building'] in ['residential', 'house', 'apartments', 'detached', 'terrace', 'dormitory', 'bungalow']:\n",
49
+ " return 'Residential'\n",
50
+ " \n",
51
+ " # Commercial\n",
52
+ " if any(key in tags for key in ['shop', 'office', 'craft']):\n",
53
+ " return 'Commercial'\n",
54
+ " if 'building' in tags and tags['building'] in ['commercial', 'office', 'retail', 'supermarket', 'kiosk']:\n",
55
+ " return 'Commercial'\n",
56
+ " \n",
57
+ " # Industrial\n",
58
+ " if 'building' in tags and tags['building'] in ['industrial', 'warehouse', 'factory', 'manufacture']:\n",
59
+ " return 'Industrial'\n",
60
+ " if 'industrial' in tags or 'industry' in tags:\n",
61
+ " return 'Industrial'\n",
62
+ " \n",
63
+ " # Educational\n",
64
+ " if 'amenity' in tags and tags['amenity'] in ['school', 'university', 'college', 'library', 'kindergarten', 'language_school']:\n",
65
+ " return 'Educational'\n",
66
+ " \n",
67
+ " # Healthcare\n",
68
+ " if 'amenity' in tags and tags['amenity'] in ['hospital', 'clinic', 'doctors', 'dentist', 'pharmacy', 'veterinary']:\n",
69
+ " return 'Healthcare'\n",
70
+ " \n",
71
+ " # Food & Drink\n",
72
+ " if 'amenity' in tags and tags['amenity'] in ['restaurant', 'cafe', 'bar', 'fast_food', 'pub', 'food_court']:\n",
73
+ " return 'Food & Drink'\n",
74
+ " \n",
75
+ " # Leisure & Entertainment\n",
76
+ " if 'leisure' in tags or 'tourism' in tags:\n",
77
+ " return 'Leisure & Entertainment'\n",
78
+ " if 'amenity' in tags and tags['amenity'] in ['theatre', 'cinema', 'nightclub', 'arts_centre', 'community_centre']:\n",
79
+ " return 'Leisure & Entertainment'\n",
80
+ " \n",
81
+ " # Transportation\n",
82
+ " if 'amenity' in tags and tags['amenity'] in ['parking', 'bicycle_parking', 'bus_station', 'ferry_terminal']:\n",
83
+ " return 'Transportation'\n",
84
+ " if 'highway' in tags or 'railway' in tags or 'aeroway' in tags:\n",
85
+ " return 'Transportation'\n",
86
+ " \n",
87
+ " # Religious\n",
88
+ " if 'amenity' in tags and tags['amenity'] in ['place_of_worship', 'monastery']:\n",
89
+ " return 'Religious'\n",
90
+ " \n",
91
+ " # Government & Public Services\n",
92
+ " if 'amenity' in tags and tags['amenity'] in ['townhall', 'courthouse', 'police', 'fire_station', 'post_office']:\n",
93
+ " return 'Government & Public Services'\n",
94
+ " \n",
95
+ " # Parks & Recreation\n",
96
+ " if 'leisure' in tags and tags['leisure'] in ['park', 'playground', 'sports_centre', 'stadium', 'garden']:\n",
97
+ " return 'Parks & Recreation'\n",
98
+ " \n",
99
+ " # Natural\n",
100
+ " if 'natural' in tags:\n",
101
+ " return 'Natural'\n",
102
+ " \n",
103
+ " # Landuse\n",
104
+ " if 'landuse' in tags:\n",
105
+ " landuse = tags['landuse'].capitalize()\n",
106
+ " if landuse in ['Residential', 'Commercial', 'Industrial', 'Retail']:\n",
107
+ " return landuse\n",
108
+ " else:\n",
109
+ " return f'Landuse: {landuse}'\n",
110
+ " \n",
111
+ " # If no specific category is found, return 'Other'\n",
112
+ " return 'Other'\n",
113
+ "\n",
114
+ "def parse_osm_data(elements: List[Dict]) -> pd.DataFrame:\n",
115
+ " parsed_data = []\n",
116
+ " for element in elements:\n",
117
+ " tags = element.get('tags', {})\n",
118
+ " parsed_element = {\n",
119
+ " 'ID': f\"{element['type']}_{element['id']}\",\n",
120
+ " 'Location Name': tags.get('name', ''),\n",
121
+ " 'Location Type': determine_location_type(tags)\n",
122
+ " }\n",
123
+ " parsed_data.append(parsed_element)\n",
124
+ " if len(parsed_data) == 0:\n",
125
+ " return pd.DataFrame(columns=['ID', 'Location Name', 'Location Type'])\n",
126
+ " return pd.DataFrame(parsed_data)\n",
127
+ "\n",
128
+ "def get_osm_data(lat: float, lon: float, radius: int) -> pd.DataFrame:\n",
129
+ " raw_data = fetch_osm_data(lat, lon, radius)\n",
130
+ " return parse_osm_data(raw_data)\n",
131
+ "\n",
132
+ "def dms_to_decimal(coord_str):\n",
133
+ " # Regular expression to match the coordinate format\n",
134
+ " pattern = r'(\\d+)°(\\d+)\\'([\\d.]+)\"([NS])\\s*(\\d+)°(\\d+)\\'([\\d.]+)\"([EW])'\n",
135
+ " \n",
136
+ " match = re.match(pattern, coord_str)\n",
137
+ " if not match:\n",
138
+ " raise ValueError(\"Invalid coordinate format. Expected format: 19°03'08.6\\\"N 72°54'06.0\\\"E\")\n",
139
+ "\n",
140
+ " lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir = match.groups()\n",
141
+ "\n",
142
+ " # Convert to decimal degrees\n",
143
+ " lat = float(lat_deg) + float(lat_min)/60 + float(lat_sec)/3600\n",
144
+ " lon = float(lon_deg) + float(lon_min)/60 + float(lon_sec)/3600\n",
145
+ "\n",
146
+ " # Adjust sign based on direction\n",
147
+ " if lat_dir == 'S':\n",
148
+ " lat = -lat\n",
149
+ " if lon_dir == 'W':\n",
150
+ " lon = -lon\n",
151
+ "\n",
152
+ " return lat, lon"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 91,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "Latitude: 19.015805555555556\n",
165
+ "Longitude: 72.89944444444446\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "coord_str = '19°00\\'56.9\"N 72°53\\'58.0\"E'\n",
171
+ "radius_meters = 1000\n",
172
+ "try:\n",
173
+ " latitude, longitude = dms_to_decimal(coord_str)\n",
174
+ " print(f\"Latitude: {latitude}\")\n",
175
+ " print(f\"Longitude: {longitude}\")\n",
176
+ "except ValueError as e:\n",
177
+ " print(f\"Error: {e}\")"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 92,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "result_df = get_osm_data(latitude, longitude, radius_meters)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 93,
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "data": {
196
+ "text/html": [
197
+ "<div>\n",
198
+ "<style scoped>\n",
199
+ " .dataframe tbody tr th:only-of-type {\n",
200
+ " vertical-align: middle;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe tbody tr th {\n",
204
+ " vertical-align: top;\n",
205
+ " }\n",
206
+ "\n",
207
+ " .dataframe thead th {\n",
208
+ " text-align: right;\n",
209
+ " }\n",
210
+ "</style>\n",
211
+ "<table border=\"1\" class=\"dataframe\">\n",
212
+ " <thead>\n",
213
+ " <tr style=\"text-align: right;\">\n",
214
+ " <th></th>\n",
215
+ " <th>ID</th>\n",
216
+ " <th>Location Name</th>\n",
217
+ " <th>Location Type</th>\n",
218
+ " </tr>\n",
219
+ " </thead>\n",
220
+ " <tbody>\n",
221
+ " <tr>\n",
222
+ " <th>0</th>\n",
223
+ " <td>node_622002639</td>\n",
224
+ " <td>Mahul</td>\n",
225
+ " <td>Other</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>1</th>\n",
229
+ " <td>node_622005407</td>\n",
230
+ " <td>Gowanpada</td>\n",
231
+ " <td>Other</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>2</th>\n",
235
+ " <td>node_1646222635</td>\n",
236
+ " <td>gadakary bus stop</td>\n",
237
+ " <td>Transportation</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>3</th>\n",
241
+ " <td>node_1646222681</td>\n",
242
+ " <td>vishnu nagar bus stop</td>\n",
243
+ " <td>Other</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>4</th>\n",
247
+ " <td>node_2932495033</td>\n",
248
+ " <td>Sree Dutta mandir</td>\n",
249
+ " <td>Religious</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>5</th>\n",
253
+ " <td>node_11954176622</td>\n",
254
+ " <td>Gavhanpada</td>\n",
255
+ " <td>Other</td>\n",
256
+ " </tr>\n",
257
+ " <tr>\n",
258
+ " <th>6</th>\n",
259
+ " <td>way_25587616</td>\n",
260
+ " <td>Bhikaji Damaji Patil Marg</td>\n",
261
+ " <td>Transportation</td>\n",
262
+ " </tr>\n",
263
+ " <tr>\n",
264
+ " <th>7</th>\n",
265
+ " <td>way_122289587</td>\n",
266
+ " <td>Mulund - Trombay 220 KV line</td>\n",
267
+ " <td>Other</td>\n",
268
+ " </tr>\n",
269
+ " <tr>\n",
270
+ " <th>8</th>\n",
271
+ " <td>way_151783563</td>\n",
272
+ " <td>Laxman Umaji Gadkari Marg</td>\n",
273
+ " <td>Transportation</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>9</th>\n",
277
+ " <td>way_151783570</td>\n",
278
+ " <td>Vishnu Nagar Road</td>\n",
279
+ " <td>Transportation</td>\n",
280
+ " </tr>\n",
281
+ " </tbody>\n",
282
+ "</table>\n",
283
+ "</div>"
284
+ ],
285
+ "text/plain": [
286
+ " ID Location Name Location Type\n",
287
+ "0 node_622002639 Mahul Other\n",
288
+ "1 node_622005407 Gowanpada Other\n",
289
+ "2 node_1646222635 gadakary bus stop Transportation\n",
290
+ "3 node_1646222681 vishnu nagar bus stop Other\n",
291
+ "4 node_2932495033 Sree Dutta mandir Religious\n",
292
+ "5 node_11954176622 Gavhanpada Other\n",
293
+ "6 way_25587616 Bhikaji Damaji Patil Marg Transportation\n",
294
+ "7 way_122289587 Mulund - Trombay 220 KV line Other\n",
295
+ "8 way_151783563 Laxman Umaji Gadkari Marg Transportation\n",
296
+ "9 way_151783570 Vishnu Nagar Road Transportation"
297
+ ]
298
+ },
299
+ "execution_count": 93,
300
+ "metadata": {},
301
+ "output_type": "execute_result"
302
+ }
303
+ ],
304
+ "source": [
305
+ "result_df.head(10)"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": 94,
311
+ "metadata": {},
312
+ "outputs": [
313
+ {
314
+ "data": {
315
+ "text/html": [
316
+ "<div>\n",
317
+ "<style scoped>\n",
318
+ " .dataframe tbody tr th:only-of-type {\n",
319
+ " vertical-align: middle;\n",
320
+ " }\n",
321
+ "\n",
322
+ " .dataframe tbody tr th {\n",
323
+ " vertical-align: top;\n",
324
+ " }\n",
325
+ "\n",
326
+ " .dataframe thead th {\n",
327
+ " text-align: right;\n",
328
+ " }\n",
329
+ "</style>\n",
330
+ "<table border=\"1\" class=\"dataframe\">\n",
331
+ " <thead>\n",
332
+ " <tr style=\"text-align: right;\">\n",
333
+ " <th></th>\n",
334
+ " <th>ID</th>\n",
335
+ " <th>Location Name</th>\n",
336
+ " <th>Location Type</th>\n",
337
+ " </tr>\n",
338
+ " </thead>\n",
339
+ " <tbody>\n",
340
+ " <tr>\n",
341
+ " <th>11</th>\n",
342
+ " <td>way_430012316</td>\n",
343
+ " <td>track</td>\n",
344
+ " <td>Residential</td>\n",
345
+ " </tr>\n",
346
+ " <tr>\n",
347
+ " <th>12</th>\n",
348
+ " <td>way_430012318</td>\n",
349
+ " <td>Mumbai Refinery Mahul</td>\n",
350
+ " <td>Industrial</td>\n",
351
+ " </tr>\n",
352
+ " <tr>\n",
353
+ " <th>13</th>\n",
354
+ " <td>way_430012320</td>\n",
355
+ " <td>Mumbai Refinery</td>\n",
356
+ " <td>Industrial</td>\n",
357
+ " </tr>\n",
358
+ " </tbody>\n",
359
+ "</table>\n",
360
+ "</div>"
361
+ ],
362
+ "text/plain": [
363
+ " ID Location Name Location Type\n",
364
+ "11 way_430012316 track Residential\n",
365
+ "12 way_430012318 Mumbai Refinery Mahul Industrial\n",
366
+ "13 way_430012320 Mumbai Refinery Industrial"
367
+ ]
368
+ },
369
+ "execution_count": 94,
370
+ "metadata": {},
371
+ "output_type": "execute_result"
372
+ }
373
+ ],
374
+ "source": [
375
+ "labelled_df = result_df[result_df['Location Type'] != 'Other']\n",
376
+ "labelled_df = labelled_df[labelled_df['Location Type'] != 'Religious']\n",
377
+ "labelled_df = labelled_df[labelled_df['Location Type'] != 'Transportation']\n",
378
+ "labelled_df.head(10)"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 95,
384
+ "metadata": {},
385
+ "outputs": [
386
+ {
387
+ "data": {
388
+ "text/html": [
389
+ "<div>\n",
390
+ "<style scoped>\n",
391
+ " .dataframe tbody tr th:only-of-type {\n",
392
+ " vertical-align: middle;\n",
393
+ " }\n",
394
+ "\n",
395
+ " .dataframe tbody tr th {\n",
396
+ " vertical-align: top;\n",
397
+ " }\n",
398
+ "\n",
399
+ " .dataframe thead th {\n",
400
+ " text-align: right;\n",
401
+ " }\n",
402
+ "</style>\n",
403
+ "<table border=\"1\" class=\"dataframe\">\n",
404
+ " <thead>\n",
405
+ " <tr style=\"text-align: right;\">\n",
406
+ " <th></th>\n",
407
+ " <th>Location Name</th>\n",
408
+ " <th>Location Type</th>\n",
409
+ " </tr>\n",
410
+ " </thead>\n",
411
+ " <tbody>\n",
412
+ " <tr>\n",
413
+ " <th>0</th>\n",
414
+ " <td>track</td>\n",
415
+ " <td>Residential</td>\n",
416
+ " </tr>\n",
417
+ " <tr>\n",
418
+ " <th>1</th>\n",
419
+ " <td>Mumbai Refinery Mahul</td>\n",
420
+ " <td>Industrial</td>\n",
421
+ " </tr>\n",
422
+ " <tr>\n",
423
+ " <th>2</th>\n",
424
+ " <td>Mumbai Refinery</td>\n",
425
+ " <td>Industrial</td>\n",
426
+ " </tr>\n",
427
+ " </tbody>\n",
428
+ "</table>\n",
429
+ "</div>"
430
+ ],
431
+ "text/plain": [
432
+ " Location Name Location Type\n",
433
+ "0 track Residential\n",
434
+ "1 Mumbai Refinery Mahul Industrial\n",
435
+ "2 Mumbai Refinery Industrial"
436
+ ]
437
+ },
438
+ "execution_count": 95,
439
+ "metadata": {},
440
+ "output_type": "execute_result"
441
+ }
442
+ ],
443
+ "source": [
444
+ "## removing duplicates\n",
445
+ "\n",
446
+ "loc_types = []\n",
447
+ "for row in labelled_df.iterrows():\n",
448
+ " loc_type = (row[1]['Location Name'], row[1]['Location Type'])\n",
449
+ " if loc_type not in loc_types:\n",
450
+ " loc_types.append(loc_type)\n",
451
+ "\n",
452
+ "labelled_df = pd.DataFrame(loc_types, columns=['Location Name', 'Location Type'])\n",
453
+ "labelled_df.head(20)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 58,
459
+ "metadata": {},
460
+ "outputs": [],
461
+ "source": [
462
+ "row_of_dataset = ''\n",
463
+ "\n",
464
+ "for row in labelled_df.iterrows():\n",
465
+ " row_text = row[1]['Location Name'] + ' is a ' + row[1]['Location Type']\n",
466
+ " row_of_dataset += row_text + ', '"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "code",
471
+ "execution_count": 59,
472
+ "metadata": {},
473
+ "outputs": [
474
+ {
475
+ "data": {
476
+ "text/plain": [
477
+ "'Oswal Company Trees is a Natural, Newspaper stall is a Commercial, Shiv Polyclinic and Nursing Home is a Healthcare, राजपूत मेडिकल is a Healthcare, Bhabha Atomic Research Centre - BARC is a Industrial, BPCL Sports Club is a Leisure & Entertainment, New Bharat Nagar, Banjara tanda, Hasina Nagar is a Residential, Old Bharat Nagar is a Residential, Rashtriya Chemicals & Fertilizers is a Industrial, Koyna Colony is a Residential, D is a Residential, A-2 is a Residential, flip card is a Commercial, track is a Residential, Mumbai Refinery Mahul is a Industrial, Mumbai Refinery is a Industrial, Trombay Thermal Power Station is a Industrial, Vitta Sanchay Society is a Residential, E is a Residential, Acharya Sharad Narayan Udyan is a Leisure & Entertainment, bmc park is a Leisure & Entertainment, Mysore Colony Central Garden is a Leisure & Entertainment, BMC owned trees is a Natural, BMC PARK is a Leisure & Entertainment, Mysore colony eastern park is a Leisure & Entertainment, Trees owned by RCF is a Natural, Mysore Colony trees is a Natural, NAVAL KG School, TS MAHUL is a Educational, '"
478
+ ]
479
+ },
480
+ "execution_count": 59,
481
+ "metadata": {},
482
+ "output_type": "execute_result"
483
+ }
484
+ ],
485
+ "source": [
486
+ "row_of_dataset"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "markdown",
491
+ "metadata": {},
492
+ "source": [
493
+ "This is one row of the dataset. Next, we write functions that build such a row for every cell of a large map area."
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 61,
499
+ "metadata": {},
500
+ "outputs": [],
501
+ "source": [
502
+ "## input point is at the bottom left of the map\n",
503
+ "\n",
504
+ "def calculate_distant_points(lat: float, lon: float, distance: float) -> tuple:\n",
505
+ " # Earth's radius in meters\n",
506
+ " R = 6371000\n",
507
+ "\n",
508
+ " # Convert latitude and longitude to radians\n",
509
+ " lat_rad = math.radians(lat)\n",
510
+ " lon_rad = math.radians(lon)\n",
511
+ "\n",
512
+ " # Calculate the point with the same latitude (moving east-west)\n",
513
+ " delta_lon = distance / (R * math.cos(lat_rad))\n",
514
+ " lon1 = lon + math.degrees(delta_lon)\n",
515
+ " \n",
516
+ " # Calculate the point with the same longitude (moving north-south)\n",
517
+ " delta_lat = distance / R\n",
518
+ " lat2 = lat + math.degrees(delta_lat)\n",
519
+ "\n",
520
+ " return ((lat, lon1), (lat2, lon))"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 66,
526
+ "metadata": {},
527
+ "outputs": [
528
+ {
529
+ "name": "stdout",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "Original point: (40.7128, -74.006)\n",
533
+ "Point 1000m east: (40.712800, -73.709386)\n",
534
+ "Point 1000m north: (40.937630, -74.006000)\n"
535
+ ]
536
+ }
537
+ ],
538
+ "source": [
539
+ "if __name__ == \"__main__\":\n",
540
+ " latitude = 40.7128 # New York City latitude\n",
541
+ " longitude = -74.0060 # New York City longitude\n",
542
+ " distance = 1000*25 # 25 km, expressed in meters\n",
543
+ "\n",
544
+ " result = calculate_distant_points(latitude, longitude, distance)\n",
545
+ " print(f\"Original point: ({latitude}, {longitude})\")\n",
546
+ " print(f\"Point 1000m east: ({result[0][0]:.6f}, {result[0][1]:.6f})\")\n",
547
+ " print(f\"Point 1000m north: ({result[1][0]:.6f}, {result[1][1]:.6f})\")"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "code",
552
+ "execution_count": 69,
553
+ "metadata": {},
554
+ "outputs": [
555
+ {
556
+ "name": "stdout",
557
+ "output_type": "stream",
558
+ "text": [
559
+ "Bottom Left: (40.7128, -74.006)\n",
560
+ "Top Left: (40.93763040147969, -74.006)\n",
561
+ "Bottom Right: (40.7128, -73.7093855252233)\n",
562
+ "Top Right: (40.93763040147969, -73.7093855252233)\n"
563
+ ]
564
+ }
565
+ ],
566
+ "source": [
567
+ "bottom_left_latitude = 40.7128\n",
568
+ "bottom_left_longitude = -74.0060\n",
569
+ "\n",
570
+ "result = calculate_distant_points(bottom_left_latitude, bottom_left_longitude, 1000*25)\n",
571
+ "\n",
572
+ "top_left_latitude = result[1][0]\n",
573
+ "top_left_longitude = result[1][1]\n",
574
+ "\n",
575
+ "bottom_right_latitude = result[0][0]\n",
576
+ "bottom_right_longitude = result[0][1]\n",
577
+ "\n",
578
+ "top_right_latitude = top_left_latitude\n",
579
+ "top_right_longitude = bottom_right_longitude\n",
580
+ "\n",
581
+ "print(f\"Bottom Left: ({bottom_left_latitude}, {bottom_left_longitude})\")\n",
582
+ "print(f\"Top Left: ({top_left_latitude}, {top_left_longitude})\")\n",
583
+ "print(f\"Bottom Right: ({bottom_right_latitude}, {bottom_right_longitude})\")\n",
584
+ "print(f\"Top Right: ({top_right_latitude}, {top_right_longitude})\")"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 71,
590
+ "metadata": {},
591
+ "outputs": [
592
+ {
593
+ "data": {
594
+ "text/plain": [
595
+ "(0.008993216059187433, 0.01186457899106813)"
596
+ ]
597
+ },
598
+ "execution_count": 71,
599
+ "metadata": {},
600
+ "output_type": "execute_result"
601
+ }
602
+ ],
603
+ "source": [
604
+ "latitude_shift = top_left_latitude - bottom_left_latitude\n",
605
+ "longitude_shift = bottom_right_longitude - bottom_left_longitude\n",
606
+ "\n",
607
+ "latitude_unit = latitude_shift / 25\n",
608
+ "longitude_unit = longitude_shift / 25\n",
609
+ "\n",
610
+ "latitude_unit, longitude_unit"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": 73,
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": [
619
+ "## 2d map grid (0,0) --> bottom left\n",
620
+ "\n",
621
+ "def create_map_grid(bottom_left: Tuple[float, float], top_right: Tuple[float, float], rows: int, cols: int) -> List[List[Tuple[float, float]]]:\n",
622
+ " grid = []\n",
623
+ " lat_unit = (top_right[0] - bottom_left[0]) / rows\n",
624
+ " lon_unit = (top_right[1] - bottom_left[1]) / cols\n",
625
+ " \n",
626
+ " for i in range(rows):\n",
627
+ " row = []\n",
628
+ " for j in range(cols):\n",
629
+ " lat = bottom_left[0] + i * lat_unit\n",
630
+ " lon = bottom_left[1] + j * lon_unit\n",
631
+ " lat = lat + lat_unit / 2\n",
632
+ " lon = lon + lon_unit / 2\n",
633
+ " row.append((lat, lon))\n",
634
+ " grid.append(row)\n",
635
+ " \n",
636
+ " return grid"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 79,
642
+ "metadata": {},
643
+ "outputs": [],
644
+ "source": [
645
+ "grid = create_map_grid((bottom_left_latitude, bottom_left_longitude), (top_right_latitude, top_right_longitude), 25, 25)"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": 108,
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": [
654
+ "grid_dataset = []\n",
655
+ "for i, row in enumerate(grid):\n",
656
+ " for j, point in enumerate(row):\n",
657
+ " \n",
658
+ " grid_row = {\"row\": i, \"col\": j, \"latitude\": point[0], \"longitude\": point[1]}\n",
659
+ " grid_dataset.append(grid_row)\n",
660
+ "\n",
661
+ "grid_df = pd.DataFrame(grid_dataset)"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "code",
666
+ "execution_count": 83,
667
+ "metadata": {},
668
+ "outputs": [],
669
+ "source": [
670
+ "left_lat = 18.889833\n",
671
+ "left_lon = 72.779844"
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": 84,
677
+ "metadata": {},
678
+ "outputs": [],
679
+ "source": [
680
+ "res1 = calculate_distant_points(left_lat, left_lon, 1000*35)\n",
681
+ "\n",
682
+ "right_lat = res1[1][0]\n",
683
+ "right_lon = res1[0][1]"
684
+ ]
685
+ },
686
+ {
687
+ "cell_type": "code",
688
+ "execution_count": 85,
689
+ "metadata": {},
690
+ "outputs": [],
691
+ "source": [
692
+ "grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), 35, 35)"
693
+ ]
694
+ },
695
+ {
696
+ "cell_type": "code",
697
+ "execution_count": null,
698
+ "metadata": {},
699
+ "outputs": [],
700
+ "source": [
701
+ "grid_dataset = []\n",
702
+ "for i, row in enumerate(grid):\n",
703
+ " for j, point in enumerate(row):\n",
704
+ " grid_row = {\"row\": i, \"col\": j, \"latitude\": point[0], \"longitude\": point[1]}\n",
705
+ " grid_dataset.append(grid_row)\n",
706
+ "\n",
707
+ "grid_df = pd.DataFrame(grid_dataset)\n",
708
+ "grid_df.head(25)"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": 106,
714
+ "metadata": {},
715
+ "outputs": [],
716
+ "source": [
717
+ "## entire pipeline\n",
718
+ "\n",
719
+ "left_lat = 18.889833\n",
720
+ "left_lon = 72.779844\n",
721
+ "dist = 35\n",
722
+ "\n",
723
+ "res1 = calculate_distant_points(left_lat, left_lon, 1000*dist)\n",
724
+ "\n",
725
+ "right_lat = res1[1][0]\n",
726
+ "right_lon = res1[0][1]\n",
727
+ "grid = create_map_grid((left_lat, left_lon), (right_lat, right_lon), dist, dist)\n",
728
+ "\n",
729
+ "grid_dataset = []\n",
730
+ "for i, row in enumerate(grid):\n",
731
+ " for j, point in enumerate(row):\n",
732
+ " result_df = get_osm_data(point[0], point[1], 710)\n",
733
+ " # print(result_df.head(3))\n",
734
+ " labelled_df = result_df[result_df['Location Type'] != 'Other']\n",
735
+ " labelled_df = labelled_df[labelled_df['Location Type'] != 'Religious']\n",
736
+ " labelled_df = labelled_df[labelled_df['Location Type'] != 'Transportation']\n",
737
+ " loc_types = []\n",
738
+ " for row in labelled_df.iterrows():\n",
739
+ " loc_type = (row[1]['Location Name'], row[1]['Location Type'])\n",
740
+ " if loc_type not in loc_types:\n",
741
+ " loc_types.append(loc_type)\n",
742
+ "\n",
743
+ " labelled_df = pd.DataFrame(loc_types, columns=['Location Name', 'Location Type'])\n",
744
+ "\n",
745
+ " row_of_dataset = ''\n",
746
+ "\n",
747
+ " for row in labelled_df.iterrows():\n",
748
+ " row_text = row[1]['Location Name'] + ' is a ' + row[1]['Location Type']\n",
749
+ " row_of_dataset += row_text + '; '\n",
750
+ " ## replacing any comma in the text with a blank space\n",
751
+ "\n",
752
+ " row_of_dataset = row_of_dataset.replace(',', ' ')\n",
753
+ " \n",
754
+ " grid_row = {\"row\": i, \"col\": j, \"latitude\": point[0], \"longitude\": point[1], \"Map Data\": row_of_dataset}\n",
755
+ " grid_dataset.append(grid_row)\n",
756
+ "\n",
757
+ "grid_df = pd.DataFrame(grid_dataset)\n",
758
+ "grid_df.to_csv('MMR_DATASET.csv', index=False)"
759
+ ]
760
+ },
761
+ {
762
+ "cell_type": "code",
763
+ "execution_count": 107,
764
+ "metadata": {},
765
+ "outputs": [
766
+ {
767
+ "data": {
768
+ "text/html": [
769
+ "<div>\n",
770
+ "<style scoped>\n",
771
+ " .dataframe tbody tr th:only-of-type {\n",
772
+ " vertical-align: middle;\n",
773
+ " }\n",
774
+ "\n",
775
+ " .dataframe tbody tr th {\n",
776
+ " vertical-align: top;\n",
777
+ " }\n",
778
+ "\n",
779
+ " .dataframe thead th {\n",
780
+ " text-align: right;\n",
781
+ " }\n",
782
+ "</style>\n",
783
+ "<table border=\"1\" class=\"dataframe\">\n",
784
+ " <thead>\n",
785
+ " <tr style=\"text-align: right;\">\n",
786
+ " <th></th>\n",
787
+ " <th>row</th>\n",
788
+ " <th>col</th>\n",
789
+ " <th>latitude</th>\n",
790
+ " <th>longitude</th>\n",
791
+ " <th>Map Data</th>\n",
792
+ " </tr>\n",
793
+ " </thead>\n",
794
+ " <tbody>\n",
795
+ " <tr>\n",
796
+ " <th>0</th>\n",
797
+ " <td>0</td>\n",
798
+ " <td>0</td>\n",
799
+ " <td>18.894330</td>\n",
800
+ " <td>72.784597</td>\n",
801
+ " <td></td>\n",
802
+ " </tr>\n",
803
+ " <tr>\n",
804
+ " <th>1</th>\n",
805
+ " <td>0</td>\n",
806
+ " <td>1</td>\n",
807
+ " <td>18.894330</td>\n",
808
+ " <td>72.794102</td>\n",
809
+ " <td>Prongs Reef is a Natural,</td>\n",
810
+ " </tr>\n",
811
+ " <tr>\n",
812
+ " <th>2</th>\n",
813
+ " <td>0</td>\n",
814
+ " <td>2</td>\n",
815
+ " <td>18.894330</td>\n",
816
+ " <td>72.803607</td>\n",
817
+ " <td>United Services Club Golf Course is a Leisure ...</td>\n",
818
+ " </tr>\n",
819
+ " <tr>\n",
820
+ " <th>3</th>\n",
821
+ " <td>0</td>\n",
822
+ " <td>3</td>\n",
823
+ " <td>18.894330</td>\n",
824
+ " <td>72.813112</td>\n",
825
+ " <td>Indian Meterological Department is a Commercia...</td>\n",
826
+ " </tr>\n",
827
+ " <tr>\n",
828
+ " <th>4</th>\n",
829
+ " <td>1</td>\n",
830
+ " <td>0</td>\n",
831
+ " <td>18.903323</td>\n",
832
+ " <td>72.784597</td>\n",
833
+ " <td></td>\n",
834
+ " </tr>\n",
835
+ " <tr>\n",
836
+ " <th>5</th>\n",
837
+ " <td>1</td>\n",
838
+ " <td>1</td>\n",
839
+ " <td>18.903323</td>\n",
840
+ " <td>72.794102</td>\n",
841
+ " <td></td>\n",
842
+ " </tr>\n",
843
+ " <tr>\n",
844
+ " <th>6</th>\n",
845
+ " <td>1</td>\n",
846
+ " <td>2</td>\n",
847
+ " <td>18.903323</td>\n",
848
+ " <td>72.803607</td>\n",
849
+ " <td>Jagadish Canteen is a Food &amp; Drink, Maratha St...</td>\n",
850
+ " </tr>\n",
851
+ " <tr>\n",
852
+ " <th>7</th>\n",
853
+ " <td>1</td>\n",
854
+ " <td>3</td>\n",
855
+ " <td>18.903323</td>\n",
856
+ " <td>72.813112</td>\n",
857
+ " <td>Indian Meterological Department is a Commercia...</td>\n",
858
+ " </tr>\n",
859
+ " <tr>\n",
860
+ " <th>8</th>\n",
861
+ " <td>2</td>\n",
862
+ " <td>0</td>\n",
863
+ " <td>18.912316</td>\n",
864
+ " <td>72.784597</td>\n",
865
+ " <td></td>\n",
866
+ " </tr>\n",
867
+ " <tr>\n",
868
+ " <th>9</th>\n",
869
+ " <td>2</td>\n",
870
+ " <td>1</td>\n",
871
+ " <td>18.912316</td>\n",
872
+ " <td>72.794102</td>\n",
873
+ " <td></td>\n",
874
+ " </tr>\n",
875
+ " <tr>\n",
876
+ " <th>10</th>\n",
877
+ " <td>2</td>\n",
878
+ " <td>2</td>\n",
879
+ " <td>18.912316</td>\n",
880
+ " <td>72.803607</td>\n",
881
+ " <td>Jagadish Canteen is a Food &amp; Drink, Maratha St...</td>\n",
882
+ " </tr>\n",
883
+ " <tr>\n",
884
+ " <th>11</th>\n",
885
+ " <td>2</td>\n",
886
+ " <td>3</td>\n",
887
+ " <td>18.912316</td>\n",
888
+ " <td>72.813112</td>\n",
889
+ " <td>Cafe Coffee Day is a Food &amp; Drink, King Plaza ...</td>\n",
890
+ " </tr>\n",
891
+ " <tr>\n",
892
+ " <th>12</th>\n",
893
+ " <td>3</td>\n",
894
+ " <td>0</td>\n",
895
+ " <td>18.921309</td>\n",
896
+ " <td>72.784597</td>\n",
897
+ " <td></td>\n",
898
+ " </tr>\n",
899
+ " <tr>\n",
900
+ " <th>13</th>\n",
901
+ " <td>3</td>\n",
902
+ " <td>1</td>\n",
903
+ " <td>18.921309</td>\n",
904
+ " <td>72.794102</td>\n",
905
+ " <td></td>\n",
906
+ " </tr>\n",
907
+ " <tr>\n",
908
+ " <th>14</th>\n",
909
+ " <td>3</td>\n",
910
+ " <td>2</td>\n",
911
+ " <td>18.921309</td>\n",
912
+ " <td>72.803607</td>\n",
913
+ " <td></td>\n",
914
+ " </tr>\n",
915
+ " <tr>\n",
916
+ " <th>15</th>\n",
917
+ " <td>3</td>\n",
918
+ " <td>3</td>\n",
919
+ " <td>18.921309</td>\n",
920
+ " <td>72.813112</td>\n",
921
+ " <td>Cafe Coffee Day is a Food &amp; Drink, King Plaza ...</td>\n",
922
+ " </tr>\n",
923
+ " </tbody>\n",
924
+ "</table>\n",
925
+ "</div>"
926
+ ],
927
+ "text/plain": [
928
+ " row col latitude longitude \\\n",
929
+ "0 0 0 18.894330 72.784597 \n",
930
+ "1 0 1 18.894330 72.794102 \n",
931
+ "2 0 2 18.894330 72.803607 \n",
932
+ "3 0 3 18.894330 72.813112 \n",
933
+ "4 1 0 18.903323 72.784597 \n",
934
+ "5 1 1 18.903323 72.794102 \n",
935
+ "6 1 2 18.903323 72.803607 \n",
936
+ "7 1 3 18.903323 72.813112 \n",
937
+ "8 2 0 18.912316 72.784597 \n",
938
+ "9 2 1 18.912316 72.794102 \n",
939
+ "10 2 2 18.912316 72.803607 \n",
940
+ "11 2 3 18.912316 72.813112 \n",
941
+ "12 3 0 18.921309 72.784597 \n",
942
+ "13 3 1 18.921309 72.794102 \n",
943
+ "14 3 2 18.921309 72.803607 \n",
944
+ "15 3 3 18.921309 72.813112 \n",
945
+ "\n",
946
+ " Map Data \n",
947
+ "0 \n",
948
+ "1 Prongs Reef is a Natural, \n",
949
+ "2 United Services Club Golf Course is a Leisure ... \n",
950
+ "3 Indian Meterological Department is a Commercia... \n",
951
+ "4 \n",
952
+ "5 \n",
953
+ "6 Jagadish Canteen is a Food & Drink, Maratha St... \n",
954
+ "7 Indian Meterological Department is a Commercia... \n",
955
+ "8 \n",
956
+ "9 \n",
957
+ "10 Jagadish Canteen is a Food & Drink, Maratha St... \n",
958
+ "11 Cafe Coffee Day is a Food & Drink, King Plaza ... \n",
959
+ "12 \n",
960
+ "13 \n",
961
+ "14 \n",
962
+ "15 Cafe Coffee Day is a Food & Drink, King Plaza ... "
963
+ ]
964
+ },
965
+ "execution_count": 107,
966
+ "metadata": {},
967
+ "output_type": "execute_result"
968
+ }
969
+ ],
970
+ "source": [
971
+ "grid_df.head(20)"
972
+ ]
973
+ }
974
+ ],
975
+ "metadata": {
976
+ "kernelspec": {
977
+ "display_name": "Python 3",
978
+ "language": "python",
979
+ "name": "python3"
980
+ },
981
+ "language_info": {
982
+ "codemirror_mode": {
983
+ "name": "ipython",
984
+ "version": 3
985
+ },
986
+ "file_extension": ".py",
987
+ "mimetype": "text/x-python",
988
+ "name": "python",
989
+ "nbconvert_exporter": "python",
990
+ "pygments_lexer": "ipython3",
991
+ "version": "3.12.0"
992
+ }
993
+ },
994
+ "nbformat": 4,
995
+ "nbformat_minor": 2
996
+ }
src/main.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ######################################## IMPORTING REQUIRED LIBRARIES ####################################
2
+ import os
3
+ import sys
4
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+ data_folder = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
6
+ from utilities import get_data, input_filter, clean_data
7
+
8
+
9
+ ################################################## INPUTS ################################################
10
+
11
+ left_lat = 18.889833
12
+ left_lon = 72.779844
13
+ dist = 35
14
+
15
def data_sourcing():
    """Fetch the map-grid dataset and persist it as ``MMR_DATA.csv``.

    The module-level ``left_lat``/``left_lon`` inputs are resolved through
    ``input_filter``, the grid is downloaded with ``get_data`` using the
    module-level ``dist``, and the raw result is written to the data folder.

    Returns:
        The DataFrame produced by ``get_data``.
    """
    origin_lat, origin_lon = input_filter(lat=left_lat, lon=left_lon)
    raw_df = get_data(origin_lat, origin_lon, dist)
    output_path = f'{data_folder}/MMR_DATA.csv'
    raw_df.to_csv(output_path, index=False)
    return raw_df
20
+
21
def data_clean_for_training(df):
    """Clean the sourced dataset and persist it as ``MMR_DATA_CLEAN.csv``.

    Args:
        df: DataFrame as returned by ``data_sourcing``.

    Returns:
        The DataFrame returned by ``clean_data``.
    """
    cleaned = clean_data(df)
    cleaned.to_csv(f'{data_folder}/MMR_DATA_CLEAN.csv', index=False)
    return cleaned
25
+
26
+
27
if __name__ == '__main__':

    df = data_sourcing()  ## testing the data sourcing endpoint
    # A DataFrame has no single truth value (``if df:`` raises ValueError),
    # so test explicitly that something with rows came back.
    if df is not None and not df.empty:
        print("Data loaded successfully !!")

    clean_df = data_clean_for_training(df)
utilities/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .data_loader import get_data, input_filter
2
+ from .data_cleaner import clean_data
utilities/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (277 Bytes). View file
 
utilities/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (251 Bytes). View file
 
utilities/__pycache__/data_loader.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
utilities/__pycache__/data_loader.cpython-312.pyc ADDED
Binary file (8.92 kB). View file
 
utilities/data_cleaner.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from nltk.stem import PorterStemmer
5
+ from nltk.stem import WordNetLemmatizer
6
+
7
def clean_text(text):
    """Normalise free text into space-joined, stemmed and lemmatised tokens.

    Punctuation (anything that is neither a word character nor whitespace)
    is removed, the text is lower-cased, English stop words are dropped and
    each remaining token is stemmed and then lemmatised.

    Args:
        text: The string to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Request the NLTK resources used below on every call, as before.
    for resource in ('stopwords', 'wordnet'):
        nltk.download(resource)

    ignored = set(stopwords.words('english'))
    stem = PorterStemmer().stem
    lemmatize = WordNetLemmatizer().lemmatize

    normalised = re.sub(r'[^\w\s]', '', text).lower()
    kept = [token for token in normalised.split() if token not in ignored]
    return ' '.join(lemmatize(stem(token)) for token in kept)
21
+
22
def clean_data(df):
    """Return the rows of *df* whose 'Map Data' text is usable.

    Missing 'Map Data' values are treated as empty strings. Rows are kept
    only when the text is non-empty and shorter than 5000 characters. The
    caller's DataFrame is left untouched; the original index labels of the
    surviving rows are preserved.

    Args:
        df: DataFrame containing a 'Map Data' column of strings.

    Returns:
        A new DataFrame with the filtered rows and NaNs in 'Map Data'
        replaced by ''.
    """
    map_data = df['Map Data'].fillna('')
    lengths = map_data.str.len()
    keep = (lengths > 0) & (lengths < 5000)
    # ``assign`` works on a copy, so the input frame is not mutated.
    # Text normalisation via clean_text is intentionally not applied here.
    return df.assign(**{'Map Data': map_data}).loc[keep]
utilities/data_loader.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ import re
4
+ import math
5
+ from typing import Tuple, List, Dict
6
+
7
def fetch_osm_data(lat: float, lon: float, radius: int, timeout: float = 180) -> List[Dict]:
    """Query the Overpass API for named OSM elements around a point.

    Args:
        lat: Latitude of the search centre, in decimal degrees.
        lon: Longitude of the search centre, in decimal degrees.
        radius: Search radius passed to Overpass ``around``.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without it a stalled connection would block the caller forever.

    Returns:
        The ``elements`` list of the Overpass JSON response (nodes, ways and
        relations that carry a ``name`` tag).

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-2xx HTTP status.
    """
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    (
    node["name"](around:{radius},{lat},{lon});
    way["name"](around:{radius},{lat},{lon});
    relation["name"](around:{radius},{lat},{lon});
    );
    out center;
    """

    response = requests.get(overpass_url, params={'data': overpass_query}, timeout=timeout)
    # Fail with the HTTP error itself instead of an opaque JSON decoding
    # error when the server answers with an error page.
    response.raise_for_status()
    data = response.json()
    return data['elements']
22
+
23
def determine_location_type(tags: Dict[str, str]) -> str:
    """Map an OSM element's tags to a coarse location category.

    The rules are evaluated top to bottom and the first match wins, so the
    order below is significant.

    Args:
        tags: The element's OSM tag dictionary (may be empty).

    Returns:
        One of the category labels ('Residential', 'Commercial',
        'Industrial', 'Educational', 'Healthcare', 'Food & Drink',
        'Parks & Recreation', 'Leisure & Entertainment', 'Transportation',
        'Religious', 'Government & Public Services', 'Natural'), a landuse
        label ('Retail' or 'Landuse: <Value>'), or 'Other' when nothing
        matches.
    """
    building = tags.get('building')
    amenity = tags.get('amenity')

    # Residential
    if building in ('residential', 'house', 'apartments', 'detached', 'terrace', 'dormitory', 'bungalow'):
        return 'Residential'

    # Commercial
    if any(key in tags for key in ('shop', 'office', 'craft')):
        return 'Commercial'
    if building in ('commercial', 'office', 'retail', 'supermarket', 'kiosk'):
        return 'Commercial'

    # Industrial
    if building in ('industrial', 'warehouse', 'factory', 'manufacture'):
        return 'Industrial'
    if 'industrial' in tags or 'industry' in tags:
        return 'Industrial'

    # Educational
    if amenity in ('school', 'university', 'college', 'library', 'kindergarten', 'language_school'):
        return 'Educational'

    # Healthcare
    if amenity in ('hospital', 'clinic', 'doctors', 'dentist', 'pharmacy', 'veterinary'):
        return 'Healthcare'

    # Food & Drink
    if amenity in ('restaurant', 'cafe', 'bar', 'fast_food', 'pub', 'food_court'):
        return 'Food & Drink'

    # Parks & Recreation
    # Must be tested before the generic leisure rule below: that rule matches
    # every element with a 'leisure' key, which previously made this branch
    # unreachable.
    if tags.get('leisure') in ('park', 'playground', 'sports_centre', 'stadium', 'garden'):
        return 'Parks & Recreation'

    # Leisure & Entertainment
    if 'leisure' in tags or 'tourism' in tags:
        return 'Leisure & Entertainment'
    if amenity in ('theatre', 'cinema', 'nightclub', 'arts_centre', 'community_centre'):
        return 'Leisure & Entertainment'

    # Transportation
    if amenity in ('parking', 'bicycle_parking', 'bus_station', 'ferry_terminal'):
        return 'Transportation'
    if 'highway' in tags or 'railway' in tags or 'aeroway' in tags:
        return 'Transportation'

    # Religious
    if amenity in ('place_of_worship', 'monastery'):
        return 'Religious'

    # Government & Public Services
    if amenity in ('townhall', 'courthouse', 'police', 'fire_station', 'post_office'):
        return 'Government & Public Services'

    # Natural
    if 'natural' in tags:
        return 'Natural'

    # Landuse
    if 'landuse' in tags:
        landuse = tags['landuse'].capitalize()
        if landuse in ('Residential', 'Commercial', 'Industrial', 'Retail'):
            return landuse
        return f'Landuse: {landuse}'

    # If no specific category is found, return 'Other'
    return 'Other'
90
+
91
def parse_osm_data(elements: List[Dict]) -> pd.DataFrame:
    """Turn raw Overpass elements into a three-column DataFrame.

    Args:
        elements: Overpass elements, each a dict with 'type', 'id' and an
            optional 'tags' mapping.

    Returns:
        A DataFrame with columns 'ID' ('<type>_<id>'), 'Location Name' (the
        'name' tag, '' when absent) and 'Location Type' (from
        ``determine_location_type``). An empty input yields an empty frame
        that still carries those three columns.
    """
    def to_record(item: Dict) -> Dict:
        item_tags = item.get('tags', {})
        return {
            'ID': f"{item['type']}_{item['id']}",
            'Location Name': item_tags.get('name', ''),
            'Location Type': determine_location_type(item_tags),
        }

    records = [to_record(item) for item in elements]
    if not records:
        # Keep the schema stable so callers can filter on 'Location Type'.
        return pd.DataFrame(columns=['ID', 'Location Name', 'Location Type'])
    return pd.DataFrame(records)
104
+
105
def get_osm_data(lat: float, lon: float, radius: int) -> pd.DataFrame:
    """Fetch the named OSM elements around a point and return them parsed.

    Combines ``fetch_osm_data`` and ``parse_osm_data``; see those functions
    for the request and the resulting columns.
    """
    return parse_osm_data(fetch_osm_data(lat, lon, radius))
108
+
109
def dms_to_decimal(coord_str):
    """Convert a degrees/minutes/seconds coordinate pair to decimal degrees.

    Args:
        coord_str: Text such as ``19°03'08.6"N 72°54'06.0"E`` (latitude
            first, then longitude).

    Returns:
        A ``(lat, lon)`` tuple of floats; southern latitudes and western
        longitudes are negative.

    Raises:
        ValueError: If *coord_str* does not start with the expected format.
    """
    dms_pattern = r'(\d+)°(\d+)\'([\d.]+)"([NS])\s*(\d+)°(\d+)\'([\d.]+)"([EW])'

    parsed = re.match(dms_pattern, coord_str)
    if parsed is None:
        raise ValueError("Invalid coordinate format. Expected format: 19°03'08.6\"N 72°54'06.0\"E")

    def to_degrees(degrees, minutes, seconds):
        # degrees + minutes/60 + seconds/3600
        return float(degrees) + float(minutes)/60 + float(seconds)/3600

    fields = parsed.groups()
    latitude = to_degrees(fields[0], fields[1], fields[2])
    longitude = to_degrees(fields[4], fields[5], fields[6])

    # Hemisphere letters decide the sign.
    signed_lat = -latitude if fields[3] == 'S' else latitude
    signed_lon = -longitude if fields[7] == 'W' else longitude
    return signed_lat, signed_lon
130
+
131
+
132
def calculate_distant_points(lat: float, lon: float, distance: float) -> tuple:
    """Offset a point by *distance* metres along its parallel and its meridian.

    Uses a spherical Earth of radius 6 371 000 m.

    Args:
        lat: Latitude of the starting point, in decimal degrees.
        lon: Longitude of the starting point, in decimal degrees.
        distance: Offset in metres (positive adds to longitude/latitude).

    Returns:
        ``((lat, shifted_lon), (shifted_lat, lon))``: first the point at the
        same latitude with the longitude shifted, then the point at the same
        longitude with the latitude shifted.
    """
    earth_radius_m = 6371000

    # Along a meridian the arc length maps directly onto the angle.
    shifted_lat = lat + math.degrees(distance / earth_radius_m)

    # Along a parallel the circle's radius shrinks by cos(latitude).
    parallel_radius_m = earth_radius_m * math.cos(math.radians(lat))
    shifted_lon = lon + math.degrees(distance / parallel_radius_m)

    return ((lat, shifted_lon), (shifted_lat, lon))
149
+
150
+ ## 2d map grid (0,0) --> bottom left
151
+
152
def create_map_grid(bottom_left: Tuple[float, float], top_right: Tuple[float, float], rows: int, cols: int) -> List[List[Tuple[float, float]]]:
    """Split a lat/lon rectangle into ``rows`` x ``cols`` cells and return their centres.

    Args:
        bottom_left: ``(lat, lon)`` of the rectangle's bottom-left corner.
        top_right: ``(lat, lon)`` of the rectangle's top-right corner.
        rows: Number of cells along the latitude axis.
        cols: Number of cells along the longitude axis.

    Returns:
        ``grid[i][j]`` is the ``(lat, lon)`` centre of the cell in row ``i``
        (counted from the bottom) and column ``j`` (counted from the left).
    """
    base_lat = bottom_left[0]
    base_lon = bottom_left[1]
    cell_height = (top_right[0] - base_lat) / rows
    cell_width = (top_right[1] - base_lon) / cols

    # Cell corner plus half a cell gives the cell centre.
    return [
        [
            (
                (base_lat + r * cell_height) + cell_height / 2,
                (base_lon + c * cell_width) + cell_width / 2,
            )
            for c in range(cols)
        ]
        for r in range(rows)
    ]
168
+
169
+ ## entire pipeline
170
+
171
+ left_lat = 18.889833
172
+ left_lon = 72.779844
173
+ dist = 35
174
+
175
def input_filter(lat=None, lon=None, string=None):
    """Resolve user input into a ``(lat, lon)`` pair.

    Args:
        lat: Latitude in decimal degrees. When given, it takes precedence
            and is returned together with *lon* unchanged.
        lon: Longitude in decimal degrees.
        string: Degrees/minutes/seconds text accepted by ``dms_to_decimal``;
            used only when *lat* is not given.

    Returns:
        A ``(lat, lon)`` tuple, or ``None`` when neither *lat* nor *string*
        was supplied.

    Raises:
        ValueError: Propagated from ``dms_to_decimal`` for malformed text.
    """
    # Identity checks: 0.0 is a valid coordinate, only None means "absent".
    if lat is not None:
        return (lat, lon)
    if string is not None:
        return dms_to_decimal(string)
    return None
183
+
184
def _describe_locations(result_df):
    """Render one cell's OSM results as '<name> is a <type>; ' sentences."""
    excluded = ['Other', 'Religious', 'Transportation']
    labelled_df = result_df[~result_df['Location Type'].isin(excluded)]

    # dict.fromkeys drops repeated (name, type) pairs, keeping first-seen order.
    unique_pairs = dict.fromkeys(
        zip(labelled_df['Location Name'], labelled_df['Location Type'])
    )
    text = ''.join(f"{name} is a {loc_type}; " for name, loc_type in unique_pairs)

    ## replacing any comma in the text with a blank space
    return text.replace(',', ' ')


def get_data(bottom_left_lat, bottom_left_lon, dist, radius=710):
    """Build the grid dataset for a square area anchored at its bottom-left corner.

    The square spans ``dist`` kilometres north and east of the given corner
    and is divided into ``dist`` x ``dist`` cells. For each cell centre the
    named OSM elements within *radius* are fetched and summarised as text.

    Args:
        bottom_left_lat: Latitude of the bottom-left corner, decimal degrees.
        bottom_left_lon: Longitude of the bottom-left corner, decimal degrees.
        dist: Side length of the area in kilometres; also used as the number
            of grid rows and columns, so it must be an integer.
        radius: Search radius around each cell centre, passed to
            ``get_osm_data``. NOTE(review): 710 presumably covers the
            half-diagonal of a 1 km cell -- confirm.

    Returns:
        A DataFrame with one row per cell and the columns 'row', 'col',
        'latitude', 'longitude' and 'Map Data'.
    """
    east_point, north_point = calculate_distant_points(bottom_left_lat, bottom_left_lon, 1000*dist)
    top_right = (north_point[0], east_point[1])

    # Anchor the grid at the corner passed in by the caller, not at the
    # module-level default coordinates.
    grid = create_map_grid((bottom_left_lat, bottom_left_lon), top_right, dist, dist)

    grid_dataset = []
    for i, grid_row in enumerate(grid):
        for j, point in enumerate(grid_row):
            result_df = get_osm_data(point[0], point[1], radius)
            grid_dataset.append({
                "row": i,
                "col": j,
                "latitude": point[0],
                "longitude": point[1],
                "Map Data": _describe_locations(result_df),
            })

    return pd.DataFrame(grid_dataset)
utils/__init__.py DELETED
File without changes