"""Streamlit dashboard for job listings stored as CSV files in a Hugging Face dataset.

The app downloads every CSV file of the configured dataset repository,
concatenates them, and offers two pages: an overview dashboard with charts
and a data explorer with optional filtering.
"""

import html
import io
import logging
import time
from datetime import datetime, timedelta

import pandas as pd
import plotly.express as px
import streamlit as st
from huggingface_hub import HfApi

logger = logging.getLogger(__name__)

# Set page config for a wider layout and custom theme.
# This must stay the first Streamlit command executed by the script.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Custom CSS for black background and styling.
# NOTE(review): the CSS payload is empty in this revision of the file -- confirm
# whether a <style> block was lost and should be restored.
st.markdown(""" """, unsafe_allow_html=True)

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

# Columns retained from the raw CSV files, in display order.
COLUMNS_TO_KEEP = [
    'site', 'job_url', 'title', 'company', 'location',
    'job_type', 'date_posted', 'is_remote', 'company_url',
]

# Columns rendered as hyperlinks in the data explorer.
URL_COLUMNS = ('job_url', 'company_url')

# Transparent backgrounds and white text, shared by every chart.
_TRANSPARENT = 'rgba(0,0,0,0)'


def _apply_dark_layout(fig):
    """Give *fig* a transparent background with white text and return it."""
    fig.update_layout(
        plot_bgcolor=_TRANSPARENT,
        paper_bgcolor=_TRANSPARENT,
        font_color='#FFFFFF',
    )
    return fig


@st.cache_data(ttl=3600)
def load_and_concat_data() -> pd.DataFrame:
    """Download all CSV files of the dataset and merge them into one frame.

    Files that cannot be downloaded or parsed are skipped (and logged) so a
    single bad file does not take the dashboard down.

    Returns:
        A DataFrame restricted to ``COLUMNS_TO_KEEP`` with ``date_posted``
        parsed as datetimes, exact duplicate rows removed, and rows without a
        parseable ``date_posted`` dropped. An empty DataFrame when no CSV
        file could be loaded.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # Pass the token here as well so that private datasets can be listed,
    # not only downloaded.
    dataset_files = api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    frames = []
    for name in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=name,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            frames.append(pd.read_csv(local_path))
        except Exception:
            # Best effort: keep going, but leave a trace instead of silently
            # swallowing the failure.
            logger.warning(
                "Skipping dataset file %r: could not download or parse it",
                name,
                exc_info=True,
            )

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    # reindex (rather than plain column selection) tolerates CSV files that
    # lack one of the expected columns; the gap is filled with NaN.
    listings = combined.reindex(columns=COLUMNS_TO_KEEP)
    listings['date_posted'] = pd.to_datetime(listings['date_posted'], errors='coerce')

    # Drop duplicates and rows with NaT in date_posted
    return listings.drop_duplicates().dropna(subset=['date_posted'])


@st.cache_data()
def get_unique_values(df):
    """Return the distinct companies, locations and job types found in *df*.

    Returns:
        A dict with the keys ``'companies'``, ``'locations'`` and
        ``'job_types'``, each holding the unique values of the matching column.
    """
    return {
        'companies': df['company'].unique(),
        'locations': df['location'].unique(),
        'job_types': df['job_type'].unique(),
    }


def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart of *y* against *_x* using the shared dark layout.

    Args:
        data: Data passed to ``plotly.express.bar``.
        _x: Values (or column reference) for the x axis.
        y: Values (or column reference) for the y axis.
        title: Chart title.
        color_sequence: Discrete colour sequence for the bars.

    Returns:
        The configured Plotly figure.
    """
    fig = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    return _apply_dark_layout(fig)


def create_time_series(df):
    """Build a line chart of the number of postings per ``date_posted`` value."""
    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
    fig = px.line(
        df_by_date,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    return _apply_dark_layout(fig)


def display_dashboard(df):
    """Render the overview page: headline metrics and summary charts.

    Expects a non-empty *df* whose ``date_posted`` column holds datetimes
    (``main`` guards against the empty case before calling this).
    """
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['location'].nunique())

        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")

    with col2:
        top_companies = df['company'].value_counts().head(10)
        fig = create_chart(
            top_companies, top_companies.index, top_companies.values,
            "Top 10 Companies", ['#4e79a7'],
        )
        st.plotly_chart(fig, use_container_width=True)

    # Job Postings Over Time Chart
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)

    with col3:
        top_locations = df['location'].value_counts().head(10)
        fig = create_chart(
            top_locations, top_locations.index, top_locations.values,
            "Top 10 Locations", ['#f28e2b'],
        )
        st.plotly_chart(fig, use_container_width=True)

    with col4:
        job_types = df['job_type'].value_counts()
        fig = px.pie(
            names=job_types.index,
            values=job_types.values,
            title="Job Types Distribution",
            color_discrete_sequence=px.colors.qualitative.Pastel,
        )
        st.plotly_chart(_apply_dark_layout(fig), use_container_width=True)


@st.cache_data
def filter_dataframe(df, companies, locations, job_types):
    """Return the rows of *df* matching every non-empty selection.

    An empty selection for a criterion means "do not filter on it".
    """
    filtered_df = df
    if companies:
        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
    if locations:
        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
    if job_types:
        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
    return filtered_df


def _make_clickable(url):
    """Return an HTML anchor labelled "Link" for *url*.

    Missing values render as an empty cell. Values that are not http(s) URLs
    are shown as escaped text instead of being turned into a link, so that
    e.g. ``javascript:`` URLs from the data never become clickable.
    """
    if not isinstance(url, str) or not url.strip():
        return ''
    target = url.strip()
    if not target.lower().startswith(('http://', 'https://')):
        return html.escape(target)
    # NOTE(review): the anchor markup is reconstructed -- the previous version
    # returned the bare text 'Link', which discarded the URL entirely.
    href = html.escape(target, quote=True)
    return f'<a href="{href}" target="_blank" rel="noopener noreferrer">Link</a>'


def _escape_cell(value):
    """HTML-escape string cells; leave every other value untouched."""
    return html.escape(value) if isinstance(value, str) else value


def display_data_explorer(df):
    """Render the data explorer page: an optionally filtered HTML table."""
    st.subheader("Data Explorer")

    show_all = st.radio("Display", ("All Data", "Filtered Data"))

    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=unique_values['locations'])
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])

        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    # Work on a copy: the presentation-only rewrite below must not leak into
    # the caller's frame (and must not write into a slice of it).
    display_df = filtered_df.copy()
    for column in display_df.columns:
        if column in URL_COLUMNS:
            display_df[column] = display_df[column].map(_make_clickable)
        elif (pd.api.types.is_object_dtype(display_df[column])
              or pd.api.types.is_string_dtype(display_df[column])):
            # The table is emitted with escape=False so the anchors survive;
            # all other text therefore has to be escaped by hand.
            display_df[column] = display_df[column].map(_escape_cell)

    st.write(display_df.to_html(escape=False, index=False), unsafe_allow_html=True)


def main():
    """Entry point: load the data and dispatch to the selected page."""
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()

    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])

    if page == "Dashboard":
        display_dashboard(df)
    elif page == "Data Explorer":
        display_data_explorer(df)


if __name__ == "__main__":
    main()