Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
from huggingface_hub import HfApi | |
import io | |
from datetime import datetime, timedelta | |
# Hugging Face setup
# Credentials come from Streamlit secrets (.streamlit/secrets.toml or the
# deployment's secret store) — never hard-code tokens in source.
HF_TOKEN = st.secrets["HF_TOKEN"]        # Hub API access token
HF_USERNAME = st.secrets["HF_USERNAME"]  # owner namespace of the dataset repo
DATASET_NAME = "jobeasz"                 # dataset repository holding the job CSVs
# Cache for 1 hour
@st.cache_data(ttl=3600)  # actually honor the "cache for 1 hour" comment: the original re-downloaded everything on every rerun
def load_and_concat_data():
    """Download every CSV in the HF dataset repo, concatenate, and tidy.

    Returns:
        pd.DataFrame: de-duplicated job listings restricted to the columns
        the app uses, with 'date_posted' parsed to datetime. Returns an
        empty frame with the expected columns if the repo has no CSVs
        (the original crashed on ``pd.concat([])``).
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # Pass the token here too so private dataset repos can be listed,
    # not just downloaded.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        # hf_hub_download returns a local cached file path; read it with pandas.
        file_content = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
        df = pd.read_csv(file_content)
        all_data.append(df)

    # Filter columns
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    if not all_data:
        # pd.concat raises ValueError on an empty list; degrade gracefully.
        return pd.DataFrame(columns=columns_to_keep)

    concatenated_df = pd.concat(all_data, ignore_index=True)
    concatenated_df = concatenated_df.drop_duplicates()
    # Ensure 'date_posted' is in datetime format
    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'])
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
    return filtered_df
def filter_data(df, start_date, end_date, selected_locations, selected_roles):
    """Return rows of *df* posted within [start_date, end_date], inclusive,
    optionally restricted to the given locations and job titles.

    An empty ``selected_locations`` / ``selected_roles`` means "no
    restriction" on that field.
    """
    posted_on = df['date_posted'].dt.date
    result = df[posted_on.between(start_date, end_date)]  # inclusive both ends
    if selected_locations:
        result = result[result['location'].isin(selected_locations)]
    if selected_roles:
        result = result[result['title'].isin(selected_roles)]
    return result
def dashboard():
    """Render the dashboard page: sidebar filters, a summary metric,
    three Plotly charts, and the five most recent postings."""
    st.title("Job Listings Dashboard")
    df = load_and_concat_data()

    # Sidebar filter controls; date pickers default to the data's full range.
    st.sidebar.header("Filters")
    start_date = st.sidebar.date_input("Start Date", df['date_posted'].min().date())
    end_date = st.sidebar.date_input("End Date", df['date_posted'].max().date())
    locations = st.sidebar.multiselect("Locations", options=df['location'].unique())
    roles = st.sidebar.multiselect("Job Roles", options=df['title'].unique())
    visible = filter_data(df, start_date, end_date, locations, roles)

    st.metric("Total Job Postings", len(visible))

    # Postings per calendar day, as a time series.
    per_day = visible.groupby(visible['date_posted'].dt.date).size().reset_index(name='count')
    st.plotly_chart(px.line(per_day, x='date_posted', y='count', title='Daily Job Postings'))

    # Top-10 bar charts for locations and job titles.
    by_location = visible['location'].value_counts().head(10)
    st.plotly_chart(px.bar(by_location, x=by_location.index, y=by_location.values, title='Top 10 Locations'))

    by_role = visible['title'].value_counts().head(10)
    st.plotly_chart(px.bar(by_role, x=by_role.index, y=by_role.values, title='Top 10 Job Roles'))

    # Five newest postings, one line each.
    st.subheader("Recent Job Postings")
    latest = visible.sort_values('date_posted', ascending=False).head(5)
    for _, job in latest.iterrows():
        st.write(f"**{job['title']}** - {job['company']} - {job['location']} - {job['date_posted'].date()}")
def data_table():
    """Render the full listings table with clickable job/company links.

    Bug fix: ``st.dataframe`` has no ``unsafe_allow_html`` parameter (that
    keyword belongs to ``st.markdown``), so the original call raised
    ``TypeError`` — and a Styler emitting raw ``<a>`` HTML would not render
    as links inside the dataframe widget anyway. The supported way is
    ``st.column_config.LinkColumn``, which renders URL columns as links.
    """
    st.title("Full Job Listings Data")
    df = load_and_concat_data()
    st.dataframe(
        df,
        column_config={
            # display_text replaces the raw URL with a short "Link" label;
            # empty/NaN cells simply render blank.
            'job_url': st.column_config.LinkColumn('job_url', display_text='Link'),
            'company_url': st.column_config.LinkColumn('company_url', display_text='Link'),
        },
    )
def main():
    """App entry point: route to a page via the sidebar radio control."""
    # Dispatch table keeps labels and page renderers together; insertion
    # order defines the radio option order.
    pages = {
        "Dashboard": dashboard,
        "Data Table": data_table,
    }
    st.sidebar.title("Navigation")
    choice = st.sidebar.radio("Go to", list(pages))
    pages[choice]()


if __name__ == "__main__":
    main()