job_easz / app.py
Niharmahesh's picture
Update app.py
e883314 verified
raw
history blame
8.82 kB
import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
import time
import pyarrow as pa
import pyarrow.parquet as pq
import math
import re
# Set page config for a wider layout and custom theme
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
# Custom CSS for black background and styling
# (injected once at import time; styles buttons, selects, dataframes and plots
# to match the dark theme used by the plotly figures below)
st.markdown("""
<style>
.stApp {
background-color: #000000;
color: #FFFFFF;
}
.stButton>button {
background-color: #4e79a7;
color: white;
}
.stSelectbox, .stMultiSelect {
color: #FFFFFF;
}
.stDataFrame {
background-color: #1E1E1E;
}
.plotly-graph-div {
background-color: #1E1E1E;
}
.big-font {
font-size: 48px;
font-weight: bold;
text-align: center;
}
</style>
""", unsafe_allow_html=True)
# Center all h1 headings (e.g. the st.title in main())
st.markdown("""
<style>
h1 {
text-align: center;
}
</style>
""", unsafe_allow_html=True)
# Hugging Face setup
# Credentials come from Streamlit secrets (raises KeyError if not configured);
# the token grants read access to the private dataset repo below.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)
def load_and_concat_data(year: int = 2024):
    """Download, concatenate and clean every CSV in the HF dataset repo.

    Downloads each ``.csv`` file from the private Hugging Face dataset,
    concatenates them, keeps only the columns the dashboard uses, and
    normalises the data (lowercasing, de-duplication, date filtering).

    Args:
        year: Only keep postings from this calendar year (default 2024,
            matching the previous hard-coded behaviour).

    Returns:
        pd.DataFrame: cleaned job postings, or an empty DataFrame when no
        file could be downloaded/parsed.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    csv_files = [f for f in dataset_files if f.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN
            )
            all_data.append(pd.read_csv(local_path, engine='pyarrow'))
        except Exception:
            # Deliberate best-effort: skip files that fail to download/parse
            # rather than failing the whole dashboard load.
            pass

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)

    # Unparseable dates become NaT and are dropped together with duplicates.
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])

    # Keep only postings from the requested year.
    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == year]

    # Normalise free-text columns to lowercase so grouping/filtering is
    # case-insensitive; vectorised .str ops propagate NaN unchanged.
    filtered_df['title'] = filtered_df['title'].str.lower()
    filtered_df['company'] = filtered_df['company'].str.lower()
    # Lowercase locations and strip a trailing ", us" / ", usa" country suffix.
    filtered_df['location'] = (
        filtered_df['location']
        .str.lower()
        .str.replace(r',\s*(us|usa)$', '', regex=True)
    )

    # Lowercasing may have created new duplicates; drop them too.
    filtered_df = filtered_df.drop_duplicates()
    return filtered_df
@st.cache_data()
def get_unique_values(df):
    """Return the distinct values of each filterable column of *df*.

    The result maps UI-facing keys to the unique values of the
    corresponding DataFrame column, cached across reruns.
    """
    key_to_column = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
        'Role_Name': 'title',
        'Date_posted': 'date_posted',
    }
    return {key: df[column].unique() for key, column in key_to_column.items()}
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled for the dark dashboard theme.

    Renders *data* as a plotly bar chart with transparent backgrounds and
    white text so it blends into the black page background.
    """
    figure = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    figure.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
    )
    return figure
def create_time_series(df, time_unit='day'):
    """Build a line chart of posting counts over time.

    Args:
        df: DataFrame with a datetime ``date_posted`` column.
        time_unit: ``'week'`` groups counts by ISO week (periods converted
            back to timestamps for plotting); anything else groups by day.
    """
    if time_unit == 'week':
        counts = df.groupby(df['date_posted'].dt.to_period('W')).size().reset_index(name='count')
        counts['date_posted'] = counts['date_posted'].dt.to_timestamp()
    else:
        counts = df.groupby('date_posted').size().reset_index(name='count')

    figure = px.line(
        counts,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    figure.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
        xaxis_title="Date",
        yaxis_title="Number of Job Postings",
    )
    # Weekly view gets one tick per week, labelled with the period start.
    if time_unit == 'week':
        figure.update_xaxes(dtick="W1", tickformat="%d %b %Y", ticklabelmode="period")
    return figure
def display_dashboard(df):
    """Render the dashboard page: headline metrics plus four charts.

    Relies on ``prepare_dashboard_data`` (defined elsewhere in this module)
    for the pre-aggregated top-N series.
    """
    top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)

    today = datetime.now().date()
    postings_today = df[df['date_posted'].dt.date == today].shape[0]

    left, right = st.columns(2)
    with left:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", postings_today)
        earliest = df['date_posted'].min().date()
        latest = df['date_posted'].max().date()
        st.write(f"Job postings from {earliest} to {latest}")
    with right:
        companies_chart = create_chart(
            top_companies, top_companies.index, top_companies.values,
            "Top 10 Companies", ['#4e79a7'],
        )
        st.plotly_chart(companies_chart, use_container_width=True)

    # Full-width time-series chart below the metric/company row.
    st.plotly_chart(create_time_series(df), use_container_width=True)

    lower_left, lower_right = st.columns(2)
    with lower_left:
        locations_chart = create_chart(
            top_locations, top_locations.index, top_locations.values,
            "Top 10 Locations", ['#f28e2b'],
        )
        st.plotly_chart(locations_chart, use_container_width=True)
    with lower_right:
        titles_chart = create_chart(
            top_job_titles, top_job_titles.index, top_job_titles.values,
            "Top 20 Job Titles", ['#59a14f'],
        )
        st.plotly_chart(titles_chart, use_container_width=True)
def display_about_page():
    """Render the static About page: feature overview, usage guide, contact.

    Fixes a user-facing typo in the rendered markdown ("varoious" ->
    "various"); all other text is unchanged.
    """
    st.markdown("""
    ## What is this application?
    The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.
    ### Key Features:
    - **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs.
    - **Data Explorer**: Dive deep into individual job listings with advanced filtering options.
    - **Real-time Data**: Fetch the latest job data from our Hugging Face dataset.
    ## How to use this application
    ### Dashboard
    1. Navigate to the Dashboard using the sidebar.
    2. View overall statistics such as total job postings, unique companies, and today's postings.
    3. Explore interactive charts showing:
    - Top companies hiring
    - Job postings over time
    - Top locations for job opportunities
    - Most common job titles
    ### Data Explorer
    1. Switch to the Data Explorer using the sidebar.
    2. Choose between viewing all data or applying filters.
    3. Use the multi-select dropdowns to filter by:
    - Companies
    - Locations
    - Job Types
    4. Browse the filtered job listings table.
    5. Click on job or company links to view more details on the original posting site.
    ## Data Source
    This application fetches data from my Private dataset which scrapes data from various job hosting portals and the data gets updated daily.
    ## Contact
    For questions, feedback, or collaboration opportunities, feel free to reach out:
    - LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/)
    """)
    # Add a clickable LinkedIn logo that opens the profile in a new tab.
    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"
    st.markdown(f"""
    <a href="{linkedin_url}" target="_blank">
    <img src="https://content.linkedin.com/content/dam/me/business/en-us/amp/brand-site/v2/bg/LI-Logo.svg.original.svg" width="100">
    </a>
    """, unsafe_allow_html=True)
def main():
    """App entry point: load data once, then route to the selected page."""
    st.title("Job Easz")

    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar navigation via a page-name -> renderer dispatch table.
    st.sidebar.title("Navigation")
    pages = {
        "Dashboard": lambda: display_dashboard(df),
        "Data Explorer": lambda: display_data_explorer(df),
        "About": display_about_page,
    }
    selection = st.sidebar.radio("Go to", list(pages))
    pages[selection]()


if __name__ == "__main__":
    main()