import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
# Hugging Face setup
# NOTE(review): these secrets must be configured in .streamlit/secrets.toml
# (or the Streamlit Cloud secrets UI); missing keys raise at import time.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Name of the Hugging Face *dataset* repo holding the scraped CSV files.
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)  # cache for 1 hour so reruns don't re-download the dataset
def load_and_concat_data():
    """Download every CSV in the HF dataset repo, concatenate, and dedupe.

    Returns:
        pd.DataFrame with columns site, job_url, title, company, location,
        job_type, date_posted (datetime64), is_remote, description,
        company_url. Empty (but correctly typed) if the repo has no CSVs.
    """
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Pass the token here too: without it, listing a *private* dataset repo
    # fails with 401 even though the download call below authenticates.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [file for file in dataset_files if file.endswith('.csv')]
    all_data = []
    for file in csv_files:
        # hf_hub_download returns the local path of the cached file.
        local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
        all_data.append(pd.read_csv(local_path))
    if not all_data:
        # pd.concat([]) raises ValueError("No objects to concatenate");
        # return an empty frame with the expected schema instead.
        empty = pd.DataFrame(columns=columns_to_keep)
        empty['date_posted'] = pd.to_datetime(empty['date_posted'])
        return empty
    concatenated_df = pd.concat(all_data, ignore_index=True).drop_duplicates()
    # Downstream code relies on .dt accessors, so coerce to datetime here.
    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'])
    return concatenated_df[columns_to_keep].reset_index(drop=True)
def filter_data(df, start_date, end_date, selected_locations, selected_roles):
    """Filter job listings by posting-date window, location, and title.

    Args:
        df: listings frame with a datetime64 'date_posted' column.
        start_date / end_date: inclusive ``datetime.date`` bounds.
        selected_locations: locations to keep; falsy means "all".
        selected_roles: titles to keep; falsy means "all".

    Returns:
        A filtered view of *df* (original frame is untouched).
    """
    posted_on = df['date_posted'].dt.date
    result = df[(posted_on >= start_date) & (posted_on <= end_date)]
    if selected_locations:
        result = result[result['location'].isin(selected_locations)]
    if selected_roles:
        result = result[result['title'].isin(selected_roles)]
    return result
def dashboard():
    """Render the dashboard page: sidebar filters, KPI, charts, recent jobs."""
    st.title("Job Listings Dashboard")
    df = load_and_concat_data()

    # --- sidebar filter controls -------------------------------------------
    st.sidebar.header("Filters")
    start_date = st.sidebar.date_input("Start Date", df['date_posted'].min().date())
    end_date = st.sidebar.date_input("End Date", df['date_posted'].max().date())
    locations = st.sidebar.multiselect("Locations", options=df['location'].unique())
    roles = st.sidebar.multiselect("Job Roles", options=df['title'].unique())

    filtered_df = filter_data(df, start_date, end_date, locations, roles)
    st.metric("Total Job Postings", len(filtered_df))

    # --- postings over time ------------------------------------------------
    per_day = (
        filtered_df
        .groupby(filtered_df['date_posted'].dt.date)
        .size()
        .reset_index(name='count')
    )
    st.plotly_chart(px.line(per_day, x='date_posted', y='count', title='Daily Job Postings'))

    # --- top-10 breakdowns -------------------------------------------------
    top_locations = filtered_df['location'].value_counts().head(10)
    st.plotly_chart(px.bar(top_locations, x=top_locations.index, y=top_locations.values, title='Top 10 Locations'))

    top_roles = filtered_df['title'].value_counts().head(10)
    st.plotly_chart(px.bar(top_roles, x=top_roles.index, y=top_roles.values, title='Top 10 Job Roles'))

    # --- five most recent postings -----------------------------------------
    st.subheader("Recent Job Postings")
    latest = filtered_df.sort_values('date_posted', ascending=False).head(5)
    for _, job in latest.iterrows():
        st.write(f"**{job['title']}** - {job['company']} - {job['location']} - {job['date_posted'].date()}")
def data_table():
    """Render the full listings table with clickable job/company links."""
    st.title("Full Job Listings Data")
    df = load_and_concat_data()
    # The old formatters emitted the bare text 'Link' (no anchor markup), and
    # st.dataframe() has no `unsafe_allow_html` keyword — that call raised
    # TypeError at runtime. Format real anchors and render the Styler's HTML
    # through st.markdown, which does accept unsafe_allow_html.
    styled = df.style.format({
        'job_url': lambda x: f'<a href="{x}" target="_blank">Link</a>' if pd.notnull(x) else '',
        'company_url': lambda x: f'<a href="{x}" target="_blank">Link</a>' if pd.notnull(x) else ''
    })
    st.markdown(styled.to_html(), unsafe_allow_html=True)
def main():
    """App entry point: route to a page via sidebar navigation."""
    st.sidebar.title("Navigation")
    # Map radio labels to page-rendering callables; dict order drives the
    # radio option order, matching the original list.
    pages = {"Dashboard": dashboard, "Data Table": data_table}
    choice = st.sidebar.radio("Go to", list(pages))
    pages[choice]()


if __name__ == "__main__":
    main()