Spaces:

Niharmahesh
/

job_easz

Running

App Files Files Community

job_easz / app.py

Niharmahesh

Update app.py

b5907fd verified 4 months ago

raw

history blame

9.99 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	from huggingface_hub import HfApi
	import io
	from datetime import datetime, timedelta
	import time
	import pyarrow as pa
	import pyarrow.parquet as pq
	import math
	# Configure the page: wide layout and the browser-tab title.
	st.set_page_config(page_title="Job Listings Dashboard", layout="wide")

	# Global stylesheet: black app background with white text, plus colours for
	# buttons, select widgets, data frames and Plotly containers, and the
	# `.big-font` helper class.
	_DARK_THEME_CSS = """
	<style>
	.stApp {
	background-color: #000000;
	color: #FFFFFF;
	}
	.stButton>button {
	background-color: #4e79a7;
	color: white;
	}
	.stSelectbox, .stMultiSelect {
	color: #FFFFFF;
	}
	.stDataFrame {
	background-color: #1E1E1E;
	}
	.plotly-graph-div {
	background-color: #1E1E1E;
	}
	.big-font {
	font-size: 48px;
	font-weight: bold;
	text-align: center;
	}
	</style>
	"""

	# Separate rule that centres every <h1>.
	_CENTERED_H1_CSS = """
	<style>
	h1 {
	text-align: center;
	}
	</style>
	"""

	# Inject the theme first, then the heading rule.
	for _stylesheet in (_DARK_THEME_CSS, _CENTERED_H1_CSS):
	    st.markdown(_stylesheet, unsafe_allow_html=True)

	# Hugging Face setup: token and username are read from Streamlit secrets.
	HF_TOKEN, HF_USERNAME = (st.secrets[key] for key in ("HF_TOKEN", "HF_USERNAME"))
	DATASET_NAME = "jobeasz"

	@st.cache_data(ttl=3600)
	def load_and_concat_data():
	    """Download every CSV of the Hugging Face dataset and merge them.

	    The result is cached by Streamlit for one hour (``ttl=3600``).

	    Returns:
	        A DataFrame restricted to the listing columns used by the app, with
	        ``date_posted`` parsed to datetimes (unparseable values coerced to
	        NaT), exact duplicate rows removed and rows without a date dropped.
	        An empty DataFrame is returned when no CSV file could be loaded.

	    Raises:
	        KeyError: if the merged data lacks one of the expected columns.
	    """
	    import logging

	    logger = logging.getLogger(__name__)
	    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

	    api = HfApi()
	    # Pass the token when listing as well, so listing and downloading
	    # authenticate the same way.
	    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
	    csv_files = [name for name in dataset_files if name.endswith('.csv')]

	    all_data = []
	    for name in csv_files:
	        try:
	            local_path = api.hf_hub_download(
	                repo_id=repo_id, filename=name, repo_type="dataset", token=HF_TOKEN
	            )
	            all_data.append(pd.read_csv(local_path, engine='pyarrow'))
	        except Exception:
	            # Best effort: one unreadable file must not take the whole app
	            # down, but the failure is logged instead of silently dropped.
	            logger.warning("Skipping dataset file %r: could not be loaded", name, exc_info=True)

	    if not all_data:
	        return pd.DataFrame()

	    concatenated_df = pd.concat(all_data, ignore_index=True)

	    columns_to_keep = [
	        'site', 'job_url', 'title', 'company', 'location',
	        'job_type', 'date_posted', 'is_remote', 'company_url',
	    ]
	    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
	    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

	    # Drop duplicates and rows with NaT in date_posted
	    return filtered_df.drop_duplicates().dropna(subset=['date_posted'])

	@st.cache_data()
	def get_unique_values(df):
	    """Collect the distinct values of the columns offered as filters.

	    Returns:
	        A dict whose keys ``companies``, ``locations``, ``job_types`` and
	        ``Role_Name`` map to the unique values of the ``company``,
	        ``location``, ``job_type`` and ``title`` columns respectively.
	    """
	    column_for_key = {
	        'companies': 'company',
	        'locations': 'location',
	        'job_types': 'job_type',
	        'Role_Name': 'title',
	    }
	    return {key: df[column].unique() for key, column in column_for_key.items()}

	def create_chart(data, _x, y, title, color_sequence):
	    """Build a Plotly bar chart with transparent backgrounds and white text.

	    Args:
	        data: Data object handed to ``px.bar``.
	        _x: Values (or column reference) for the x axis.
	        y: Values (or column reference) for the y axis.
	        title: Chart title.
	        color_sequence: Discrete colour sequence for the bars.

	    Returns:
	        The configured Plotly figure.
	    """
	    transparent = 'rgba(0,0,0,0)'
	    bar_figure = px.bar(
	        data,
	        x=_x,
	        y=y,
	        title=title,
	        color_discrete_sequence=color_sequence,
	    )
	    bar_figure.update_layout(
	        plot_bgcolor=transparent,
	        paper_bgcolor=transparent,
	        font_color='#FFFFFF',
	    )
	    return bar_figure

	def create_time_series(df):
	    """Build a line chart of the number of rows per ``date_posted`` value.

	    Returns:
	        A Plotly line figure titled "Job Postings Over Time" with
	        transparent backgrounds, white text and labelled axes.
	    """
	    postings_per_date = (
	        df.groupby('date_posted')
	        .size()
	        .reset_index(name='count')
	    )
	    layout_options = {
	        'plot_bgcolor': 'rgba(0,0,0,0)',
	        'paper_bgcolor': 'rgba(0,0,0,0)',
	        'font_color': '#FFFFFF',
	        'xaxis_title': "Date",
	        'yaxis_title': "Number of Job Postings",
	    }
	    line_figure = px.line(
	        postings_per_date,
	        x='date_posted',
	        y='count',
	        title="Job Postings Over Time",
	        color_discrete_sequence=['#4e79a7'],
	    )
	    line_figure.update_layout(**layout_options)
	    return line_figure

	@st.cache_data
	def prepare_dashboard_data(df):
	    """Pre-compute the aggregates shown on the dashboard.

	    Returns:
	        A 4-tuple: value counts of the 10 most frequent companies, the 10
	        most frequent locations, the 20 most frequent titles, and a frame
	        with the number of rows per ``date_posted`` (column ``count``).
	    """
	    def most_frequent(column, limit):
	        # value_counts() sorts by frequency, so head() yields the top entries.
	        return df[column].value_counts().head(limit)

	    postings_per_date = df.groupby('date_posted').size().reset_index(name='count')
	    return (
	        most_frequent('company', 10),
	        most_frequent('location', 10),
	        most_frequent('title', 20),
	        postings_per_date,
	    )

	def display_dashboard(df):
	    """Render the dashboard page.

	    Shows headline metrics and the covered date range next to the top
	    companies chart, then the postings-over-time line chart, then the top
	    locations and top job titles charts side by side.
	    """
	    # The per-date frame from the helper is not needed here;
	    # create_time_series() derives its own from df.
	    top_companies, top_locations, top_job_titles, _ = prepare_dashboard_data(df)

	    posted = df['date_posted']
	    posted_today = posted.dt.date == datetime.now().date()
	    jobs_today = int(posted_today.sum())

	    overview_col, companies_col = st.columns(2)

	    with overview_col:
	        st.subheader("Job Postings Overview")
	        headline_metrics = (
	            ("Total Job Postings", len(df)),
	            ("Unique Companies", df['company'].nunique()),
	            ("Job Postings Today", jobs_today),
	        )
	        for label, value in headline_metrics:
	            st.metric(label, value)

	        min_date, max_date = posted.min().date(), posted.max().date()
	        st.write(f"Job postings from {min_date} to {max_date}")

	    with companies_col:
	        companies_fig = create_chart(
	            top_companies, top_companies.index, top_companies.values,
	            "Top 10 Companies", ['#4e79a7'],
	        )
	        st.plotly_chart(companies_fig, use_container_width=True)

	    # Job Postings Over Time Chart
	    st.plotly_chart(create_time_series(df), use_container_width=True)

	    lower_charts = (
	        (top_locations, "Top 10 Locations", '#f28e2b'),
	        (top_job_titles, "Top 20 Job Titles", '#59a14f'),
	    )
	    for column, (counts, title, color) in zip(st.columns(2), lower_charts):
	        with column:
	            chart = create_chart(counts, counts.index, counts.values, title, [color])
	            st.plotly_chart(chart, use_container_width=True)
	@st.cache_data
	def filter_dataframe(df, companies, locations, job_types, Role_Name):
	    """Keep only the rows matching every non-empty selection.

	    Args:
	        df: Frame with ``company``, ``location``, ``job_type`` and ``title``
	            columns.
	        companies: Allowed ``company`` values; empty means "no filter".
	        locations: Allowed ``location`` values; empty means "no filter".
	        job_types: Allowed ``job_type`` values; empty means "no filter".
	        Role_Name: Allowed ``title`` values; empty means "no filter".

	    Returns:
	        The rows of ``df`` that satisfy all active filters.
	    """
	    selections = (
	        ('company', companies),
	        ('location', locations),
	        ('job_type', job_types),
	        # The title filter must use Role_Name; matching titles against
	        # job_types made the role selection ineffective.
	        ('title', Role_Name),
	    )
	    filtered_df = df
	    for column, allowed in selections:
	        if allowed:
	            filtered_df = filtered_df[filtered_df[column].isin(allowed)]
	    return filtered_df

	def display_data_explorer(df):
	    """Render the Data Explorer page: optional filters plus a paginated table.

	    The user chooses between all rows and a filtered subset (companies,
	    locations, job types, role types). The selected rows are shown 15 per
	    page as an HTML table in which ``job_url`` and ``company_url`` are
	    rendered as "Link" anchors.

	    Args:
	        df: Frame of job listings; must contain the ``job_url`` and
	            ``company_url`` columns as well as the filter columns.
	    """
	    import html  # local import: only needed for escaping in this view

	    st.subheader("Data Explorer")

	    show_all = st.radio("Display", ("All Data", "Filtered Data"))

	    if show_all == "Filtered Data":
	        unique_values = get_unique_values(df)
	        col1, col2, col3, col4 = st.columns(4)
	        with col1:
	            companies = st.multiselect("Select Companies", options=unique_values['companies'])
	        with col2:
	            locations = st.multiselect("Select Locations", options=unique_values['locations'])
	        with col3:
	            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
	        with col4:
	            role_names = st.multiselect("Select Role Types", options=unique_values['Role_Name'])

	        filtered_df = filter_dataframe(df, companies, locations, job_types, role_names)
	    else:
	        filtered_df = df

	    st.write(f"Showing {len(filtered_df)} job listings")

	    # Pagination
	    items_per_page = 15
	    # Always expose at least one page: with zero matching rows the page
	    # count would be 0 and number_input would get max_value < min_value.
	    num_pages = max(1, math.ceil(len(filtered_df) / items_per_page))

	    _, centre_col, _ = st.columns([1, 3, 1])
	    with centre_col:
	        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)

	    start_idx = (page - 1) * items_per_page
	    end_idx = start_idx + items_per_page

	    # Work on a copy so the column rewrites below never touch (or warn
	    # about) the frame the slice was taken from.
	    page_df = filtered_df.iloc[start_idx:end_idx].copy()

	    link_columns = ('job_url', 'company_url')

	    def make_clickable(url):
	        # Missing URLs become an empty cell instead of a link to "nan".
	        if pd.isna(url) or not str(url).strip():
	            return ''
	        safe_url = html.escape(str(url), quote=True)
	        return (
	            f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer" '
	            f'style="color: #4e79a7;">Link</a>'
	        )

	    def escape_text(value):
	        # Non-string cells (dates, booleans, NaN) are left untouched.
	        return html.escape(value) if isinstance(value, str) else value

	    # The table is emitted with escape=False so the anchors survive, hence
	    # every other text cell has to be escaped by hand before rendering.
	    for column in page_df.columns:
	        formatter = make_clickable if column in link_columns else escape_text
	        page_df[column] = page_df[column].map(formatter)

	    st.write(page_df.to_html(escape=False, index=False), unsafe_allow_html=True)

	    _, centre_col, _ = st.columns([1, 3, 1])
	    with centre_col:
	        st.write(f"Page {page} of {num_pages}")
	def display_about_page():
	    """Render the About page: description, usage guide, data source and contact.

	    The text is emitted as Markdown, followed by a clickable LinkedIn logo
	    rendered as raw HTML.
	    """
	    st.markdown("""
	## What is this application?

	The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.

	### Key Features:
	- Interactive Dashboard: Visualize job market trends with dynamic charts and graphs.
	- Data Explorer: Dive deep into individual job listings with advanced filtering options.
	- Real-time Data: Fetch the latest job data from our Hugging Face dataset.

	## How to use this application

	### Dashboard
	1. Navigate to the Dashboard using the sidebar.
	2. View overall statistics such as total job postings, unique companies, and today's postings.
	3. Explore interactive charts showing:
	- Top companies hiring
	- Job postings over time
	- Top locations for job opportunities
	- Most common job titles

	### Data Explorer
	1. Switch to the Data Explorer using the sidebar.
	2. Choose between viewing all data or applying filters.
	3. Use the multi-select dropdowns to filter by:
	- Companies
	- Locations
	- Job Types
	- Role Types
	4. Browse the filtered job listings table.
	5. Click on job or company links to view more details on the original posting site.

	## Data Source
	This application fetches data from my private dataset, which scrapes listings from various job hosting portals and is updated daily.

	## Contact
	For questions, feedback, or collaboration opportunities, feel free to reach out:

	- LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/)
	""")

	    # Add a clickable LinkedIn button
	    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"
	    st.markdown(f"""
	<a href="{linkedin_url}" target="_blank">
	<img src="https://content.linkedin.com/content/dam/me/business/en-us/amp/brand-site/v2/bg/LI-Logo.svg.original.svg" width="100">
	</a>
	""", unsafe_allow_html=True)
	def main():
	    """Run the app: load the data, then render the page picked in the sidebar.

	    If no data could be loaded, an error message is shown and nothing else
	    is rendered.
	    """
	    st.title("Job Easz")

	    df = load_and_concat_data()
	    if df.empty:
	        st.error("No data available. Please check your dataset.")
	        return

	    # Sidebar for navigation: each label maps to the callable that draws it.
	    st.sidebar.title("Navigation")
	    views = {
	        "Dashboard": lambda: display_dashboard(df),
	        "Data Explorer": lambda: display_data_explorer(df),
	        "About": display_about_page,
	    }
	    page = st.sidebar.radio("Go to", list(views))

	    render = views.get(page)
	    if render is not None:
	        render()


	if __name__ == "__main__":
	    main()