UNAVS committed on
Commit
72a6c46
1 Parent(s): 585ffb3

Add source code and datasets for scraping, analysis, and modelling

Browse files
final_code.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
result_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
sample_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
uni_course_scraping.ipynb ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd \n",
10
+ "import numpy as np \n",
11
+ "import re\n",
12
+ "\n",
13
+ "from time import sleep\n",
14
+ "from selenium import webdriver\n",
15
+ "from selenium.webdriver.common.by import By\n",
16
+ "from selenium.common.exceptions import NoSuchElementException\n",
17
+ "from selenium.webdriver.common.keys import Keys\n",
18
+ "from selenium.webdriver.support.select import Select\n",
19
+ "from selenium.webdriver.common.action_chains import ActionChains"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "### Initialize Browser (Chrome)\n",
29
+ "options = webdriver.ChromeOptions()\n",
30
+ "options.add_experimental_option('excludeSwitches', ['enable-logging'])\n",
31
+ "driver = webdriver.Chrome(options=options)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {},
37
+ "source": [
38
+ "# Collect Data"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 6,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "0 https://iisma.kemdikbud.go.id/info/02-university-college-london/\n",
51
+ "1 https://iisma.kemdikbud.go.id/info/03-university-of-chicago/\n",
52
+ "2 https://iisma.kemdikbud.go.id/info/04-nanyang-technological-university/\n",
53
+ "3 https://iisma.kemdikbud.go.id/info/05-the-university-of-pennsylvania-college-of-liberal-and-professional-studies/\n",
54
+ "4 https://iisma.kemdikbud.go.id/info/06-yale-university/\n",
55
+ "5 https://iisma.kemdikbud.go.id/info/07-university-of-edinburgh/\n",
56
+ "6 https://iisma.kemdikbud.go.id/info/09-the-australian-national-university/\n",
57
+ "7 https://iisma.kemdikbud.go.id/info/10-university-of-melbourne/\n",
58
+ "8 https://iisma.kemdikbud.go.id/info/11-university-of-sydney/\n",
59
+ "9 https://iisma.kemdikbud.go.id/info/12-university-of-new-south-wales/\n",
60
+ "10 https://iisma.kemdikbud.go.id/info/13-university-of-british-columbia/\n",
61
+ "11 https://iisma.kemdikbud.go.id/info/14-the-university-of-queensland/\n",
62
+ "12 https://iisma.kemdikbud.go.id/info/15-monash-university/\n",
63
+ "13 https://iisma.kemdikbud.go.id/info/16-university-of-warwick/\n",
64
+ "14 https://iisma.kemdikbud.go.id/info/17-universiti-malaya/\n",
65
+ "15 https://iisma.kemdikbud.go.id/info/18-national-taiwan-university/\n",
66
+ "16 https://iisma.kemdikbud.go.id/info/18-osaka-university/\n",
67
+ "17 https://iisma.kemdikbud.go.id/info/20-ku-leuven/\n",
68
+ "18 https://iisma.kemdikbud.go.id/info/21-university-of-texas-at-austin/\n",
69
+ "19 https://iisma.kemdikbud.go.id/info/22-university-of-glasgow/\n",
70
+ "20 https://iisma.kemdikbud.go.id/info/23-korea-university/\n",
71
+ "21 https://iisma.kemdikbud.go.id/info/24-m-v-lomonosov-moscow-state-university/\n",
72
+ "22 https://iisma.kemdikbud.go.id/info/25-university-of-auckland/\n",
73
+ "23 https://iisma.kemdikbud.go.id/info/26-university-of-leeds/\n",
74
+ "24 https://iisma.kemdikbud.go.id/info/27-the-university-of-western-australia/\n",
75
+ "25 https://iisma.kemdikbud.go.id/info/28-university-of-birmingham/\n",
76
+ "26 https://iisma.kemdikbud.go.id/info/29-penn-state-university/\n",
77
+ "27 https://iisma.kemdikbud.go.id/info/30-university-of-california-davis/\n",
78
+ "28 https://iisma.kemdikbud.go.id/info/31-boston-university-metropolitan-college/\n",
79
+ "29 https://iisma.kemdikbud.go.id/info/32-the-university-of-adelaide/\n",
80
+ "30 https://iisma.kemdikbud.go.id/info/33-university-college-cork/\n",
81
+ "31 https://iisma.kemdikbud.go.id/info/34-queen-mary-university-of-london/\n",
82
+ "32 https://iisma.kemdikbud.go.id/info/35-uc-chile/\n",
83
+ "33 https://iisma.kemdikbud.go.id/info/36-newcastle-university/\n",
84
+ "34 https://iisma.kemdikbud.go.id/info/37-humboldt-universitat-zu-berlin/\n",
85
+ "35 https://iisma.kemdikbud.go.id/info/38-universiti-kebangsaan-malaysia/\n",
86
+ "36 https://iisma.kemdikbud.go.id/info/39-lancaster-university/\n",
87
+ "37 https://iisma.kemdikbud.go.id/info/40-universiti-sains-malaysia/\n",
88
+ "38 https://iisma.kemdikbud.go.id/info/41-grenoble-ecole-de-management/\n",
89
+ "39 https://iisma.kemdikbud.go.id/info/42-university-of-waterloo/\n",
90
+ "40 https://iisma.kemdikbud.go.id/info/43-university-of-york/\n",
91
+ "41 https://iisma.kemdikbud.go.id/info/44-hanyang-university-seoul-campus/\n",
92
+ "42 https://iisma.kemdikbud.go.id/info/45-michigan-state-university/\n",
93
+ "43 https://iisma.kemdikbud.go.id/info/46-western-university/\n",
94
+ "44 https://iisma.kemdikbud.go.id/info/47-sapienza-university-of-rome/\n",
95
+ "45 https://iisma.kemdikbud.go.id/info/48-university-college-dublin/\n",
96
+ "46 https://iisma.kemdikbud.go.id/info/49-university-of-twente/\n",
97
+ "47 https://iisma.kemdikbud.go.id/info/50-university-of-liverpool/\n",
98
+ "48 https://iisma.kemdikbud.go.id/info/51-university-of-otago/\n",
99
+ "49 https://iisma.kemdikbud.go.id/info/52-keio-university/\n",
100
+ "50 https://iisma.kemdikbud.go.id/info/53-universidad-autonoma-de-madrid/\n",
101
+ "51 https://iisma.kemdikbud.go.id/info/54-vrije-universiteit-amsterdam/\n",
102
+ "52 https://iisma.kemdikbud.go.id/info/55-chulalongkorn-university/\n",
103
+ "53 https://iisma.kemdikbud.go.id/info/56-arizona-state-university/\n",
104
+ "54 https://iisma.kemdikbud.go.id/info/57-radboud-university/\n",
105
+ "55 https://iisma.kemdikbud.go.id/info/58-university-of-sussex/\n",
106
+ "56 https://iisma.kemdikbud.go.id/info/59-maastricht-university/\n",
107
+ "57 https://iisma.kemdikbud.go.id/info/60-universitat-pompeu-fabra/\n",
108
+ "58 https://iisma.kemdikbud.go.id/info/61-university-of-leicester/\n",
109
+ "59 https://iisma.kemdikbud.go.id/info/62-victoria-university-of-wellington/\n",
110
+ "60 https://iisma.kemdikbud.go.id/info/63-university-of-padua/\n",
111
+ "61 https://iisma.kemdikbud.go.id/info/64-university-of-colorado-boulder/\n",
112
+ "62 https://iisma.kemdikbud.go.id/info/65-university-of-galway/\n",
113
+ "63 https://iisma.kemdikbud.go.id/info/66-university-of-canterbury/\n",
114
+ "64 https://iisma.kemdikbud.go.id/info/68-university-of-warsaw/\n",
115
+ "65 https://iisma.kemdikbud.go.id/info/69-university-of-tartu/\n",
116
+ "66 https://iisma.kemdikbud.go.id/info/71-national-taiwan-university-of-science-and-technology-taiwan-tech/\n",
117
+ "67 https://iisma.kemdikbud.go.id/info/72-university-of-pisa/\n",
118
+ "68 https://iisma.kemdikbud.go.id/info/73-leiden-university/\n",
119
+ "69 https://iisma.kemdikbud.go.id/info/73-middle-east-technical-university/\n",
120
+ "70 https://iisma.kemdikbud.go.id/info/74-singapore-management-university/\n",
121
+ "71 https://iisma.kemdikbud.go.id/info/75-university-of-szeged/\n",
122
+ "72 https://iisma.kemdikbud.go.id/info/76-palacky-university-olomouc/\n",
123
+ "73 https://iisma.kemdikbud.go.id/info/77-university-of-zagreb/\n",
124
+ "74 https://iisma.kemdikbud.go.id/info/78-vytautas-magnus-university/\n",
125
+ "75 https://iisma.kemdikbud.go.id/info/lolos-67-sciences-po/\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "link_df = pd.read_excel(\"C:/Users/mhani/Downloads/Link Kampus IISMA.xlsx\", header=None)  # NOTE(review): machine-specific absolute path\n",
130
+ "link_list = link_df[0].to_list()\n",
131
+ "for i, link in enumerate(link_list):  # echo index + URL of every campus page\n",
132
+ "    print(i, link)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 10,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "uni_details = pd.DataFrame(columns=[\"ID\", \"Name\", \"Location\", \"Requirements\", \"Period\", \"Statistics\"])\n",
143
+ "uni_courses = pd.DataFrame(columns=[\"Univ_ID\", \"Course Name\", \"Details\"])"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 17,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "for i, link in enumerate(link_list):  # visit every university page; fill uni_details and uni_courses\n",
152
+ "    driver.get(link)\n",
153
+ "    name = driver.find_element(By.XPATH, '//*[@class=\"elementor-heading-title elementor-size-default\"]').text\n",
154
+ "    loc = driver.find_elements(By.XPATH, '//*[@class=\"elementor-widget-container\"]')[2].text\n",
155
+ "    # Click each of the first three tab titles in turn and read the text of the matching content panel.\n",
156
+ "    tab_list = driver.find_elements(By.XPATH, '//*[@class=\"elementor-tab-title elementor-tab-desktop-title\"]')\n",
157
+ "    tab_list[0].click()\n",
158
+ "    req = driver.find_element(By.ID, 'elementor-tab-content-4502').text\n",
159
+ "    tab_list[1].click()\n",
160
+ "    period = driver.find_element(By.ID, 'elementor-tab-content-4503').text\n",
161
+ "    tab_list[2].click()\n",
162
+ "    stats = driver.find_element(By.ID, 'elementor-tab-content-4504').text\n",
163
+ "    # One row per university; the ID is the 1-based position of the page in link_list.\n",
164
+ "    uni_details.loc[len(uni_details)] = [i+1, name, loc, req, period, stats]\n",
165
+ "    # Expand each course toggle, strip its HTML to text, collapse it again. NOTE(review): [1] presumably skips the still-active tab panel - confirm.\n",
166
+ "    course_list = driver.find_elements(By.XPATH, '//*[@class=\"elementor-toggle-title\"]')\n",
167
+ "    for toggle in course_list:\n",
168
+ "        toggle.click()\n",
169
+ "        course_name = toggle.text\n",
170
+ "        detail_content = driver.find_elements(By.XPATH, '//*[@class=\"elementor-tab-content elementor-clearfix elementor-active\"]')[1]\n",
171
+ "        inner_detail = detail_content.get_attribute('innerHTML')\n",
172
+ "        clean_detail = re.sub('<[^<]+?>', ' ', inner_detail)  # every tag becomes a single space\n",
173
+ "        clean_detail = clean_detail.replace(\"&nbsp;\", \" \")  # keep a space so adjacent words do not fuse\n",
174
+ "        toggle.click()\n",
175
+ "        sleep(1)  # pause one second before handling the next toggle\n",
176
+ "        uni_courses.loc[len(uni_courses)] = [i+1, course_name, clean_detail]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 20,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "uni_courses.to_excel(\"uni_courses.xlsx\", index=False)"
187
+ ]
188
+ }
189
+ ],
190
+ "metadata": {
191
+ "kernelspec": {
192
+ "display_name": "Python 3",
193
+ "language": "python",
194
+ "name": "python3"
195
+ },
196
+ "language_info": {
197
+ "codemirror_mode": {
198
+ "name": "ipython",
199
+ "version": 3
200
+ },
201
+ "file_extension": ".py",
202
+ "mimetype": "text/x-python",
203
+ "name": "python",
204
+ "nbconvert_exporter": "python",
205
+ "pygments_lexer": "ipython3",
206
+ "version": "3.10.0"
207
+ }
208
+ },
209
+ "nbformat": 4,
210
+ "nbformat_minor": 2
211
+ }
uni_courses.xlsx ADDED
Binary file (305 kB). View file