job_easz / app.py
Niharmahesh's picture
Update app.py
e883314 verified
raw
history blame
8.82 kB
import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
import time
import pyarrow as pa
import pyarrow.parquet as pq
import math
import re
# Set page config for a wider layout and custom theme
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
# Custom CSS for black background and styling
# (injected once at import time; styles buttons, selects, dataframes and plots
# to match the dark theme used by the plotly figures below)
st.markdown("""
<style>
.stApp {
background-color: #000000;
color: #FFFFFF;
}
.stButton>button {
background-color: #4e79a7;
color: white;
}
.stSelectbox, .stMultiSelect {
color: #FFFFFF;
}
.stDataFrame {
background-color: #1E1E1E;
}
.plotly-graph-div {
background-color: #1E1E1E;
}
.big-font {
font-size: 48px;
font-weight: bold;
text-align: center;
}
</style>
""", unsafe_allow_html=True)
# Center all h1 headings (e.g. the st.title in main())
st.markdown("""
<style>
h1 {
text-align: center;
}
</style>
""", unsafe_allow_html=True)
# Hugging Face setup
# Credentials come from Streamlit secrets (raises KeyError if not configured);
# the token grants read access to the private dataset repo below.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)
def load_and_concat_data(year: int = 2024):
    """Download, concatenate and clean every CSV in the HF dataset repo.

    Downloads each ``.csv`` file from the private Hugging Face dataset,
    concatenates them, keeps only the columns the dashboard uses, and
    normalises the data (lowercasing, de-duplication, date filtering).

    Args:
        year: Only keep postings from this calendar year (default 2024,
            matching the previous hard-coded behaviour).

    Returns:
        pd.DataFrame: cleaned job postings, or an empty DataFrame when no
        file could be downloaded/parsed.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    csv_files = [f for f in dataset_files if f.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN
            )
            all_data.append(pd.read_csv(local_path, engine='pyarrow'))
        except Exception:
            # Deliberate best-effort: skip files that fail to download/parse
            # rather than failing the whole dashboard load.
            pass

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)

    # Unparseable dates become NaT and are dropped together with duplicates.
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])

    # Keep only postings from the requested year.
    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == year]

    # Normalise free-text columns to lowercase so grouping/filtering is
    # case-insensitive; vectorised .str ops propagate NaN unchanged.
    filtered_df['title'] = filtered_df['title'].str.lower()
    filtered_df['company'] = filtered_df['company'].str.lower()
    # Lowercase locations and strip a trailing ", us" / ", usa" country suffix.
    filtered_df['location'] = (
        filtered_df['location']
        .str.lower()
        .str.replace(r',\s*(us|usa)$', '', regex=True)
    )

    # Lowercasing may have created new duplicates; drop them too.
    filtered_df = filtered_df.drop_duplicates()
    return filtered_df
@st.cache_data()
def get_unique_values(df):
    """Return the distinct values of each filterable column of *df*.

    The result maps UI-facing keys to the unique values of the
    corresponding DataFrame column, cached across reruns.
    """
    key_to_column = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
        'Role_Name': 'title',
        'Date_posted': 'date_posted',
    }
    return {key: df[column].unique() for key, column in key_to_column.items()}
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled for the dark dashboard theme.

    Renders *data* as a plotly bar chart with transparent backgrounds and
    white text so it blends into the black page background.
    """
    figure = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    figure.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
    )
    return figure
def create_time_series(df, time_unit='day'):
    """Build a line chart of posting counts over time.

    Args:
        df: DataFrame with a datetime ``date_posted`` column.
        time_unit: ``'week'`` groups counts by ISO week (periods converted
            back to timestamps for plotting); anything else groups by day.
    """
    if time_unit == 'week':
        counts = df.groupby(df['date_posted'].dt.to_period('W')).size().reset_index(name='count')
        counts['date_posted'] = counts['date_posted'].dt.to_timestamp()
    else:
        counts = df.groupby('date_posted').size().reset_index(name='count')

    figure = px.line(
        counts,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    figure.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
        xaxis_title="Date",
        yaxis_title="Number of Job Postings",
    )
    # Weekly view gets one tick per week, labelled with the period start.
    if time_unit == 'week':
        figure.update_xaxes(dtick="W1", tickformat="%d %b %Y", ticklabelmode="period")
    return figure
def display_dashboard(df):
    """Render the dashboard page: headline metrics plus four charts.

    Relies on ``prepare_dashboard_data`` (defined elsewhere in this module)
    for the pre-aggregated top-N series.
    """
    top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)

    today = datetime.now().date()
    postings_today = df[df['date_posted'].dt.date == today].shape[0]

    left, right = st.columns(2)
    with left:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", postings_today)
        earliest = df['date_posted'].min().date()
        latest = df['date_posted'].max().date()
        st.write(f"Job postings from {earliest} to {latest}")
    with right:
        companies_chart = create_chart(
            top_companies, top_companies.index, top_companies.values,
            "Top 10 Companies", ['#4e79a7'],
        )
        st.plotly_chart(companies_chart, use_container_width=True)

    # Full-width time-series chart below the metric/company row.
    st.plotly_chart(create_time_series(df), use_container_width=True)

    lower_left, lower_right = st.columns(2)
    with lower_left:
        locations_chart = create_chart(
            top_locations, top_locations.index, top_locations.values,
            "Top 10 Locations", ['#f28e2b'],
        )
        st.plotly_chart(locations_chart, use_container_width=True)
    with lower_right:
        titles_chart = create_chart(
            top_job_titles, top_job_titles.index, top_job_titles.values,
            "Top 20 Job Titles", ['#59a14f'],
        )
        st.plotly_chart(titles_chart, use_container_width=True)
def display_about_page():
    """Render the static About page: feature overview, usage guide, contact.

    Fixes a user-facing typo in the rendered markdown ("varoious" ->
    "various"); all other text is unchanged.
    """
    st.markdown("""
    ## What is this application?
    The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.
    ### Key Features:
    - **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs.
    - **Data Explorer**: Dive deep into individual job listings with advanced filtering options.
    - **Real-time Data**: Fetch the latest job data from our Hugging Face dataset.
    ## How to use this application
    ### Dashboard
    1. Navigate to the Dashboard using the sidebar.
    2. View overall statistics such as total job postings, unique companies, and today's postings.
    3. Explore interactive charts showing:
    - Top companies hiring
    - Job postings over time
    - Top locations for job opportunities
    - Most common job titles
    ### Data Explorer
    1. Switch to the Data Explorer using the sidebar.
    2. Choose between viewing all data or applying filters.
    3. Use the multi-select dropdowns to filter by:
    - Companies
    - Locations
    - Job Types
    4. Browse the filtered job listings table.
    5. Click on job or company links to view more details on the original posting site.
    ## Data Source
    This application fetches data from my Private dataset which scrapes data from various job hosting portals and the data gets updated daily.
    ## Contact
    For questions, feedback, or collaboration opportunities, feel free to reach out:
    - LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/)
    """)
    # Add a clickable LinkedIn logo that opens the profile in a new tab.
    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"
    st.markdown(f"""
    <a href="{linkedin_url}" target="_blank">
    <img src="https://content.linkedin.com/content/dam/me/business/en-us/amp/brand-site/v2/bg/LI-Logo.svg.original.svg" width="100">
    </a>
    """, unsafe_allow_html=True)
def main():
    """App entry point: load data once, then route to the selected page."""
    st.title("Job Easz")

    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar navigation via a page-name -> renderer dispatch table.
    st.sidebar.title("Navigation")
    pages = {
        "Dashboard": lambda: display_dashboard(df),
        "Data Explorer": lambda: display_data_explorer(df),
        "About": display_about_page,
    }
    selection = st.sidebar.radio("Go to", list(pages))
    pages[selection]()


if __name__ == "__main__":
    main()