Create app.py
app.py
ADDED
@@ -0,0 +1,594 @@
import streamlit as st
import requests
import numpy as np
import pandas as pd
from streamlit_lottie import st_lottie
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Settings for running the Chrome driver without a UI (headless)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome('chromedriver', options=options)

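# Note: passing the driver path positionally ('chromedriver') is the Selenium 3 style
# and is deprecated in Selenium 4. If this Space runs Selenium 4+, an equivalent
# (assumed) setup would be:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('chromedriver'), options=options)
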
# Wuzzuf scraping function
def Wuzzuf_scrapping(job_type, job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num/15)

    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content)
        Title = soup.find_all('h2', {'class': 'css-m604qf'})

        # get the info about the jobs on this page
        for x in range(0, len(Title)):
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for j in range(len(loc[:-1])):
                r = r + ', ' + loc[:-1][j].strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", (soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text)))
            company_name.append(m)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                n = []
                for j in range(len(c)):
                    n.append(c[j].text)
                Job_type.append(n)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])

            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall(r'[0-9-+]*', yy)
            y1 = ""
            for j in range(len(yy)):
                if any(yy[j]):
                    y1 = y1 + yy[j]
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            posted = (soup.find_all('div', {'class': 'css-d7j1kk'}))[x].find('div')
            post_time.append(posted.text)

            # get the logo of the company
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content)
            company_logo.append(soup1.find_all('meta', {'property': "og:image"})[0]['content'])
            #time.sleep(4)

            # get Job_Categories, Skills_And_Tools, job_description and Job_Requirements from the job page
            driver = webdriver.Chrome('chromedriver', options=options)
            #driver.implicitly_wait(10)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}

            new = sections[0].text.split("\n\n")

            if len(new) != 1:
                for j in range(len(new)):
                    result = []
                    for k in (new[j].split('\n')[1:]):
                        result.append(k.replace("\u202f", " "))
                    dict_other[new[j].split('\n')[0]] = result

                #result = re.sub('[\W_]+', '', ini_string)

                Job_Requirements.append(dict_other)

            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))

    # create a data frame to combine everything together
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links, 'Company_Name': company_name, 'Career_Level': Career_Level, 'post_time': post_time, 'Experience_Needed': Experience_Needed, 'Company_Logo': company_logo, "Job_Categories": Job_Categories, "Skills_And_Tools": Skills_And_Tools, "job_description": job_description, "Job_Requirements": Job_Requirements})

    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]

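# Illustrative example (not executed by the app): scrape the first 20 "Machine Learning"
# postings from Wuzzuf and inspect the resulting DataFrame.
#   df_wuzzuf = Wuzzuf_scrapping("Machine Learning", 20)
#   print(df_wuzzuf[['Title', 'Company_Name', 'Location']].head())
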
# LinkedIn scraping function
def LINKEDIN_Scrapping(job_search, num_jobs):
    job1 = job_search.split(" ")[0]
    job2 = job_search.split(" ")[1]

    link1 = 'https://www.linkedin.com/jobs/search?keywords=' + job1 + '%20' + job2 + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'

    # FIRST: get the main information about the jobs

    title = []
    location = []
    country = []
    company_name = []
    post_time = []
    links = []
    # get the requested number of jobs
    l1 = ""
    ll = ""
    driver = webdriver.Chrome('chromedriver', options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')

        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        options.add_argument("window-size=1200x600")
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        print(len(l1))
        time.sleep(2)

    l2 = l1[:num_jobs]

    for info in l2:
        info_tot = info.text.split("\n")
        if len(info_tot) == 5:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[4])
        else:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[5])

    # get the links of the jobs
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links, 'Company_Name': company_name, 'post_time': post_time})

    # GET DESCRIPTION AND LOGO
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome('chromedriver', options=options)
            driver.get(link)
            options.add_argument("window-size=1200x600")
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            t_reverse = t.text[::-1]

            if t_reverse[:9] == "erom wohs":
                l = len(t.text)
                strings = t.text[:l-9].split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
            else:
                strings = t.text.split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
        df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

        return df_ml

    # apply the description and logo function
    E = all_description_LOGO(links)

    # other info function (seniority level, employment type, ...)
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content)
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
            output = pd.DataFrame([dic])
            frames.append(output)
        result = pd.concat(frames)
        return result

    # apply the other-info function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # combine all together
    result = pd.concat([df_ml, E, df], axis=1)

    return result

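# Illustrative example (not executed by the app): pull the first 10 "Data Analysis"
# postings from LinkedIn; the result combines titles, descriptions, logos and criteria.
#   df_linkedin = LINKEDIN_Scrapping("Data Analysis", 10)
#   print(df_linkedin.columns.tolist())
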
##################### map_bubble #####################

#### function to show a map of the job locations


def map_bubble(df):

    import requests
    import urllib.parse
    g = []
    for i in range(len(df.Location)):

        if df.Location.loc[i].split(","):
            g.append(df.Location.loc[i].split(",")[0])
        else:
            g.append(df.Location.loc[i])
    df['new_loc'] = g
    if 'country' in df.columns:
        df["full_location"] = df["new_loc"] + ", " + df["country"]
        dict_cities = dict(df.full_location.value_counts())
    else:
        dict_cities = dict(df.new_loc.value_counts())
    lat = []
    lon = []
    bubble_df = pd.DataFrame()
    add = []
    val = []
    try:
        # geocode every distinct location with the OpenStreetMap Nominatim API
        for address in dict_cities.keys():
            url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) + '?format=json'

            response = requests.get(url).json()
            lat.append(response[0]["lat"])
            lon.append(response[0]["lon"])
            add.append(address)
            val.append(dict_cities[address])
    except:
        pass

    bubble_df['address'] = add
    bubble_df['lat'] = lat
    bubble_df['lon'] = lon
    bubble_df['value'] = val

    # import the library
    import folium

    # Make an empty map
    m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
    # add the markers one by one to the map
    for i in range(0, len(bubble_df)):
        folium.Circle(
            location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
            popup=bubble_df.iloc[i][['address', 'value']].values,
            radius=float(bubble_df.iloc[i]['value'])*500,
            color='#69b3a2',
            fill=True,
            fill_color='#69b3a2'
        ).add_to(m)
    # return the map so the caller can render it
    return m

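# Note (assumption): Streamlit does not render a folium.Map object automatically;
# displaying the returned map typically needs the streamlit-folium component, e.g.
#   from streamlit_folium import st_folium
#   st_folium(map_bubble(n1))
# Nominatim also asks clients to keep request rates low (roughly 1 request/second),
# so large result sets may geocode slowly or be rate-limited.
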
##########################


#########################
#### wuzzuf analysis
def wuzzuf_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)

    type_grouped = df1['Career_Level'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Career_Level'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Career Level Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Career Level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)

    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)

    type_grouped = df1['Experience_Needed'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Experience_Needed'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Experience Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Experience Level (years)",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return

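# Note: wuzzuf_exp expects the columns produced by Wuzzuf_scrapping
# ('Title', 'Career_Level', 'Location', 'Experience_Needed'), so it is only
# meant for DataFrames returned by that function.
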
#########################
### linkedin analysis

def linkedin_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)

    type_grouped = df1['Employment type'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Employment type'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Employment Type Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Employment type",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)

    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)

    type_grouped = df1['Seniority level'].value_counts()
    #e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Seniority level'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Seniority Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Seniority level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return


########################

####################### Streamlit app ################################

#site = ""
#job = ""
#num_jobs = 0

st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")


# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communication Engineer")
        st.write(
            "In this app we will scrape jobs from the LinkedIn and Wuzzuf websites, let's get it started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass
        # st_lottie(lottie_coding, height=300, key="coding")


from streamlit_option_menu import option_menu

#with st.sidebar:
#    selected = option_menu("Main Menu", ["select website", 'search job', 'numbers of jobs'], icons=['linkedin', 'search', '123'], menu_icon="cast", default_index=1)

webs = ["Wuzzuf", "Linkedin"]
jobs = ["Machine Learning", "AI Engineer", "Data Analysis", "Software Testing"]
nums = np.arange(1, 1000)

#with st.sidebar:
#if selected == "select website":
site = st.sidebar.selectbox("select one website", webs)
#elif selected == "search job":
job = st.sidebar.selectbox("select one job", jobs)
#elif selected == "numbers of jobs":
num_jobs = st.sidebar.selectbox("select the number of jobs you want to scrape", nums)

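# Note: each scraped posting opens its own headless Chrome instance, so runtime grows
# roughly linearly with the selected number of jobs; large values in this selectbox
# can take a very long time on a small Space.
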
import streamlit.components.v1 as components

import hydralit_components as hc
n2 = pd.DataFrame()

if st.sidebar.button('Start Scraping'):
    if site == "Wuzzuf":

        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = Wuzzuf_scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                #tab3.plotly_chart(wuzzuf_exp(n1))
                wuzzuf_exp(n1)


    if site == "Linkedin":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = LINKEDIN_Scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                linkedin_exp(n1)  # WILL CHANGE