import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
import time
# Configure the page: wide layout and the browser-tab title.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
# Inject custom markup/CSS into the page.
# NOTE(review): the string passed here contains only a newline, so nothing is
# actually injected — presumably a <style> block was lost; confirm and restore.
st.markdown("""
""", unsafe_allow_html=True)
# Hugging Face setup: the credentials are read from Streamlit secrets, so
# both keys must be defined there. DATASET_NAME is the dataset repository
# name, combined with HF_USERNAME to build the repo id.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)
def load_and_concat_data():
    """Download every CSV file of the Hugging Face dataset and merge them.

    The result is cached by Streamlit for one hour (``ttl=3600``).

    Returns:
        A DataFrame restricted to the job-listing columns used by the app,
        with ``date_posted`` parsed to datetimes, exact duplicate rows
        removed and rows without a parseable ``date_posted`` dropped.
        An empty DataFrame is returned when no CSV file could be loaded.
    """
    import logging

    logger = logging.getLogger(__name__)
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Authenticate the listing call as well: the downloads below already use
    # the token, and a private dataset cannot be listed anonymously.
    dataset_files = api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=file,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            all_data.append(pd.read_csv(local_path))
        except Exception:
            # Best effort: one unreadable file must not take the whole
            # dashboard down, but the failure is logged instead of hidden.
            logger.warning(
                "Skipping dataset file %r: download or parsing failed",
                file,
                exc_info=True,
            )

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url',
    ]
    # reindex() yields an all-NaN column for any expected column missing
    # from the merged data, where plain selection would raise KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep).reset_index(drop=True)
    # Unparseable dates become NaT and are removed just below.
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
    return filtered_df
@st.cache_data()
def get_unique_values(df):
    """Collect the distinct filter options offered by the data explorer.

    Missing values are dropped first: ``Series.unique()`` keeps NaN, which
    would otherwise show up as a selectable ``nan`` option.

    Args:
        df: DataFrame with ``company``, ``location`` and ``job_type`` columns.

    Returns:
        A dict with keys ``companies``, ``locations`` and ``job_types``, each
        holding the array of distinct non-missing values of that column.
    """
    return {
        'companies': df['company'].dropna().unique(),
        'locations': df['location'].dropna().unique(),
        'job_types': df['job_type'].dropna().unique(),
    }
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart with transparent backgrounds and white text.

    Args:
        data: Data object forwarded to ``px.bar``.
        _x: Values (or column reference) for the x axis.
        y: Values (or column reference) for the y axis.
        title: Chart title.
        color_sequence: Discrete colour sequence for the bars.

    Returns:
        The configured Plotly figure.
    """
    transparent = 'rgba(0,0,0,0)'
    bar_figure = px.bar(
        data,
        x=_x,
        y=y,
        title=title,
        color_discrete_sequence=color_sequence,
    )
    bar_figure.update_layout(
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        font_color='#FFFFFF',
    )
    return bar_figure
def create_time_series(df):
    """Build a line chart of the number of rows per ``date_posted`` value.

    Args:
        df: DataFrame with a ``date_posted`` column.

    Returns:
        A Plotly line figure with transparent backgrounds and white text.
    """
    # One row per distinct date_posted value, with its row count.
    postings_per_date = (
        df.groupby('date_posted')
        .size()
        .reset_index(name='count')
    )
    line_figure = px.line(
        postings_per_date,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    transparent = 'rgba(0,0,0,0)'
    line_figure.update_layout(
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        font_color='#FFFFFF',
    )
    return line_figure
def display_dashboard(df):
    """Render the dashboard page.

    Shows headline metrics and the posting date range next to a top-10
    companies chart, then a postings-over-time line chart, then a top-10
    locations chart next to a job-type pie chart.

    Args:
        df: Job listings with ``company``, ``location``, ``job_type`` and
            ``date_posted`` columns.
    """
    overview_col, companies_col = st.columns(2)

    with overview_col:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['location'].nunique())
        posted = df['date_posted']
        min_date = posted.min().date()
        max_date = posted.max().date()
        st.write(f"Job postings from {min_date} to {max_date}")

    with companies_col:
        top_companies = df['company'].value_counts().head(10)
        companies_fig = create_chart(
            top_companies,
            top_companies.index,
            top_companies.values,
            "Top 10 Companies",
            ['#4e79a7'],
        )
        st.plotly_chart(companies_fig, use_container_width=True)

    # Full-width chart between the two rows of columns.
    st.plotly_chart(create_time_series(df), use_container_width=True)

    locations_col, job_types_col = st.columns(2)

    with locations_col:
        top_locations = df['location'].value_counts().head(10)
        locations_fig = create_chart(
            top_locations,
            top_locations.index,
            top_locations.values,
            "Top 10 Locations",
            ['#f28e2b'],
        )
        st.plotly_chart(locations_fig, use_container_width=True)

    with job_types_col:
        job_types = df['job_type'].value_counts()
        pie_fig = px.pie(
            names=job_types.index,
            values=job_types.values,
            title="Job Types Distribution",
            color_discrete_sequence=px.colors.qualitative.Pastel,
        )
        pie_fig.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font_color='#FFFFFF',
        )
        st.plotly_chart(pie_fig, use_container_width=True)
@st.cache_data
def filter_dataframe(df, companies, locations, job_types):
    """Keep only the rows matching every non-empty selection.

    Args:
        df: DataFrame with ``company``, ``location`` and ``job_type`` columns.
        companies: Selected companies; an empty selection disables the filter.
        locations: Selected locations; an empty selection disables the filter.
        job_types: Selected job types; an empty selection disables the filter.

    Returns:
        The filtered DataFrame, or ``df`` itself when nothing is selected.
    """
    selections = (
        ('company', companies),
        ('location', locations),
        ('job_type', job_types),
    )
    result = df
    for column, chosen in selections:
        if not chosen:
            continue
        result = result[result[column].isin(chosen)]
    return result
def display_data_explorer(df):
    """Render the data explorer page: optional filters plus an HTML table.

    The user chooses between all rows and a subset filtered by company,
    location and job type. The ``job_url`` and ``company_url`` columns are
    rendered as clickable "Link" anchors.

    Args:
        df: Job listings containing at least the ``company``, ``location``,
            ``job_type``, ``job_url`` and ``company_url`` columns.
    """
    import html

    st.subheader("Data Explorer")
    show_all = st.radio("Display", ("All Data", "Filtered Data"))
    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=unique_values['locations'])
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df
    st.write(f"Showing {len(filtered_df)} job listings")

    def make_clickable(url):
        """Return an anchor tag for *url*, or safe text when it is not a web link."""
        if pd.isna(url):
            return ''
        text = str(url)
        # Only http(s) targets become links, so a value such as a
        # "javascript:" URL can never end up inside an href attribute.
        if not text.lower().startswith(('http://', 'https://')):
            return html.escape(text)
        safe_url = html.escape(text, quote=True)
        return f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">Link</a>'

    def escape_text(value):
        """HTML-escape string cells; leave other values (dates, bools, NaN) as-is."""
        return html.escape(value) if isinstance(value, str) else value

    link_columns = ('job_url', 'company_url')
    # Work on a copy so the caller's DataFrame keeps its raw URLs.
    display_df = filtered_df.copy()
    for column in display_df.columns:
        if column in link_columns:
            display_df[column] = display_df[column].apply(make_clickable)
        else:
            # The table is emitted with escape=False so the anchors survive;
            # every other text cell is therefore escaped by hand.
            display_df[column] = display_df[column].map(escape_text)
    st.write(display_df.to_html(escape=False, index=False), unsafe_allow_html=True)
def main():
    """Run the app: load the data, then render the page chosen in the sidebar.

    Shows an error message and stops when no data could be loaded.
    """
    st.title("Job Listings Dashboard")
    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
    if page == "Dashboard":
        display_dashboard(df)
    elif page == "Data Explorer":  # the comparison operator was missing here
        display_data_explorer(df)


if __name__ == "__main__":
    main()