Create app.py
app.py
ADDED
@@ -0,0 +1,594 @@
import streamlit as st
import requests
import numpy as np
import pandas as pd
from streamlit_lottie import st_lottie
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Settings for running the Chrome driver without a UI (headless)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome('chromedriver', options=options)

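# Note: passing the driver path positionally ('chromedriver') is the Selenium 3 style
# and is deprecated in Selenium 4. If this Space runs Selenium 4+, an equivalent
# (assumed) setup would be:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('chromedriver'), options=options)
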
# Wuzzuf scraping function
def Wuzzuf_scrapping(job_type, job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num/15)

    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content)
        Title = soup.find_all('h2', {'class': 'css-m604qf'})

        # get the info about the jobs on this page
        for x in range(0, len(Title)):
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for j in range(len(loc[:-1])):
                r = r + ', ' + loc[:-1][j].strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", (soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text)))
            company_name.append(m)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                n = []
                for j in range(len(c)):
                    n.append(c[j].text)
                Job_type.append(n)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])

            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall(r'[0-9-+]*', yy)
            y1 = ""
            for j in range(len(yy)):
                if any(yy[j]):
                    y1 = y1 + yy[j]
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            posted = (soup.find_all('div', {'class': 'css-d7j1kk'}))[x].find('div')
            post_time.append(posted.text)

            # get the logo of the company
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content)
            company_logo.append(soup1.find_all('meta', {'property': "og:image"})[0]['content'])
            #time.sleep(4)

            # get Job_Categories, Skills_And_Tools, job_description and Job_Requirements from the job page
            driver = webdriver.Chrome('chromedriver', options=options)
            #driver.implicitly_wait(10)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}

            new = sections[0].text.split("\n\n")

            if len(new) != 1:
                for j in range(len(new)):
                    result = []
                    for k in (new[j].split('\n')[1:]):
                        result.append(k.replace("\u202f", " "))
                    dict_other[new[j].split('\n')[0]] = result

                #result = re.sub('[\W_]+', '', ini_string)

                Job_Requirements.append(dict_other)

            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))

    # create a data frame to combine everything together
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links, 'Company_Name': company_name, 'Career_Level': Career_Level, 'post_time': post_time, 'Experience_Needed': Experience_Needed, 'Company_Logo': company_logo, "Job_Categories": Job_Categories, "Skills_And_Tools": Skills_And_Tools, "job_description": job_description, "Job_Requirements": Job_Requirements})

    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]

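# Illustrative example (not executed by the app): scrape the first 20 "Machine Learning"
# postings from Wuzzuf and inspect the resulting DataFrame.
#   df_wuzzuf = Wuzzuf_scrapping("Machine Learning", 20)
#   print(df_wuzzuf[['Title', 'Company_Name', 'Location']].head())
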
# LinkedIn scraping function
def LINKEDIN_Scrapping(job_search, num_jobs):
    job1 = job_search.split(" ")[0]
    job2 = job_search.split(" ")[1]

    link1 = 'https://www.linkedin.com/jobs/search?keywords=' + job1 + '%20' + job2 + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'

    # FIRST: get the main information about the jobs

    title = []
    location = []
    country = []
    company_name = []
    post_time = []
    links = []
    # get the requested number of jobs
    l1 = ""
    ll = ""
    driver = webdriver.Chrome('chromedriver', options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')

        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        options.add_argument("window-size=1200x600")
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        print(len(l1))
        time.sleep(2)

    l2 = l1[:num_jobs]

    for info in l2:
        info_tot = info.text.split("\n")
        if len(info_tot) == 5:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[4])
        else:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[5])

    # get the links of the jobs
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links, 'Company_Name': company_name, 'post_time': post_time})

    # GET DESCRIPTION AND LOGO
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome('chromedriver', options=options)
            driver.get(link)
            options.add_argument("window-size=1200x600")
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            t_reverse = t.text[::-1]

            if t_reverse[:9] == "erom wohs":
                l = len(t.text)
                strings = t.text[:l-9].split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
            else:
                strings = t.text.split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
        df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

        return df_ml

    # apply the description and logo function
    E = all_description_LOGO(links)

    # other info function (seniority level, employment type, ...)
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content)
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
            output = pd.DataFrame([dic])
            frames.append(output)
        result = pd.concat(frames)
        return result

    # apply the other-info function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # combine all together
    result = pd.concat([df_ml, E, df], axis=1)

    return result

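# Illustrative example (not executed by the app): pull the first 10 "Data Analysis"
# postings from LinkedIn; the result combines titles, descriptions, logos and criteria.
#   df_linkedin = LINKEDIN_Scrapping("Data Analysis", 10)
#   print(df_linkedin.columns.tolist())
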
##################### map_bubble #####################

#### function to show a map of the job locations


def map_bubble(df):

    import requests
    import urllib.parse
    g = []
    for i in range(len(df.Location)):

        if df.Location.loc[i].split(","):
            g.append(df.Location.loc[i].split(",")[0])
        else:
            g.append(df.Location.loc[i])
    df['new_loc'] = g
    if 'country' in df.columns:
        df["full_location"] = df["new_loc"] + ", " + df["country"]
        dict_cities = dict(df.full_location.value_counts())
    else:
        dict_cities = dict(df.new_loc.value_counts())
    lat = []
    lon = []
    bubble_df = pd.DataFrame()
    add = []
    val = []
    try:
        # geocode every distinct location with the OpenStreetMap Nominatim API
        for address in dict_cities.keys():
            url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) + '?format=json'

            response = requests.get(url).json()
            lat.append(response[0]["lat"])
            lon.append(response[0]["lon"])
            add.append(address)
            val.append(dict_cities[address])
    except:
        pass

    bubble_df['address'] = add
    bubble_df['lat'] = lat
    bubble_df['lon'] = lon
    bubble_df['value'] = val

    # import the library
    import folium

    # Make an empty map
    m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
    # add the markers one by one to the map
    for i in range(0, len(bubble_df)):
        folium.Circle(
            location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
            popup=bubble_df.iloc[i][['address', 'value']].values,
            radius=float(bubble_df.iloc[i]['value'])*500,
            color='#69b3a2',
            fill=True,
            fill_color='#69b3a2'
        ).add_to(m)
    # return the map so the caller can render it
    return m

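# Note (assumption): Streamlit does not render a folium.Map object automatically;
# displaying the returned map typically needs the streamlit-folium component, e.g.
#   from streamlit_folium import st_folium
#   st_folium(map_bubble(n1))
# Nominatim also asks clients to keep request rates low (roughly 1 request/second),
# so large result sets may geocode slowly or be rate-limited.
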
##########################


#########################
#### wuzzuf analysis
def wuzzuf_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)

    type_grouped = df1['Career_Level'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Career_Level'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Career Level Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Career Level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)

    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)

    type_grouped = df1['Experience_Needed'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Experience_Needed'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Experience Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Experience Level (years)",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return

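# Note: wuzzuf_exp expects the columns produced by Wuzzuf_scrapping
# ('Title', 'Career_Level', 'Location', 'Experience_Needed'), so it is only
# meant for DataFrames returned by that function.
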
#########################
### linkedin analysis

def linkedin_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)

    type_grouped = df1['Employment type'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Employment type'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Employment Type Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Employment type",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)

    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)

    type_grouped = df1['Seniority level'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Seniority level'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Seniority Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Seniority level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return


########################

####################### Streamlit app ################################

#site = ""
#job = ""
#num_jobs = 0

st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")


# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communication Engineer")
        st.write(
            "In this app we will scrape jobs from the LinkedIn and Wuzzuf websites, let's get it started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass
        # st_lottie(lottie_coding, height=300, key="coding")


from streamlit_option_menu import option_menu

#with st.sidebar:
#    selected = option_menu("Main Menu", ["select website", 'search job', 'numbers of jobs'], icons=['linkedin', 'search', '123'], menu_icon="cast", default_index=1)

webs = ["Wuzzuf", "Linkedin"]
jobs = ["Machine Learning", "AI Engineer", "Data Analysis", "Software Testing"]
nums = np.arange(1, 1000)

#with st.sidebar:
#if selected == "select website":
site = st.sidebar.selectbox("select one website", webs)
#elif selected == "search job":
job = st.sidebar.selectbox("select one job", jobs)
#elif selected == "numbers of jobs":
num_jobs = st.sidebar.selectbox("select the number of jobs you want to scrape", nums)

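# Note: each scraped posting opens its own headless Chrome instance, so runtime grows
# roughly linearly with the selected number of jobs; large values in this selectbox
# can take a very long time on a small Space.
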
import streamlit.components.v1 as components

import hydralit_components as hc
n2 = pd.DataFrame()

if st.sidebar.button('Start Scraping'):
    if site == "Wuzzuf":

        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = Wuzzuf_scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                #tab3.plotly_chart(wuzzuf_exp(n1))
                wuzzuf_exp(n1)


    if site == "Linkedin":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = LINKEDIN_Scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                linkedin_exp(n1)  # WILL CHANGE