"""Streamlit dashboard for browsing job listings stored as CSV files in a
Hugging Face dataset repository.

The app has two pages, selected from the sidebar:

* "Dashboard"     - headline metrics and bar/line charts.
* "Data Explorer" - the listings as an HTML table, optionally filtered by
  company, parsed location and job type.
"""

import html
import io
import logging
import time
from datetime import datetime, timedelta

import pandas as pd
import plotly.express as px
import streamlit as st
from huggingface_hub import HfApi

logger = logging.getLogger(__name__)

# Set page config for a wider layout and custom theme
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Custom CSS for black background and styling
# NOTE(review): the stylesheet literal below contains only whitespace, so no
# CSS is actually injected - restore the <style> block if the dark theme is
# still wanted.
st.markdown(""" """, unsafe_allow_html=True)

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

# Columns retained from the concatenated CSV files.
COLUMNS_TO_KEEP = [
    'site', 'job_url', 'title', 'company', 'location',
    'job_type', 'date_posted', 'is_remote', 'company_url',
]

# Columns rendered as hyperlinks in the Data Explorer table.
URL_COLUMNS = ('job_url', 'company_url')

# Known locations, matched by substring against the raw location text.
# Order matters: the first entry contained in the raw value wins.
VALID_LOCATIONS = (
    "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL",
    "Houston, TX", "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX",
    "San Diego, CA", "Dallas, TX", "San Jose, CA", "Austin, TX",
    "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH",
    "San Francisco Bay Area", "Washington, D.C.", "Boston, MA",
    "Seattle, WA", "Denver, CO", "Nashville, TN", "Baltimore, MD",
    "Portland, OR", "Las Vegas, NV", "Milwaukee, WI", "Albuquerque, NM",
    "Tucson, AZ", "Fresno, CA", "Sacramento, CA", "Long Beach, CA",
    "Kansas City, MO", "Mesa, AZ", "Atlanta, GA", "Colorado Springs, CO",
    "Raleigh, NC", "Omaha, NE", "Miami, FL", "Oakland, CA",
    "Minneapolis, MN", "Tulsa, OK", "Cleveland, OH", "Wichita, KS",
    "Arlington, TX", "New Orleans, LA", "Bakersfield, CA", "Tampa, FL",
    "Honolulu, HI", "Aurora, CO", "Anaheim, CA", "Santa Ana, CA",
    "St. Louis, MO", "Riverside, CA", "Corpus Christi, TX", "Lexington, KY",
    "Pittsburgh, PA", "Anchorage, AK", "Stockton, CA", "Cincinnati, OH",
    "St. Paul, MN", "Toledo, OH", "Newark, NJ", "Greensboro, NC",
    "Plano, TX", "Henderson, NV", "Lincoln, NE", "Buffalo, NY",
    "Fort Wayne, IN", "Jersey City, NJ", "Chula Vista, CA", "Orlando, FL",
    "St. Petersburg, FL", "Norfolk, VA", "Chandler, AZ", "Laredo, TX",
    "Madison, WI", "Durham, NC", "Lubbock, TX", "Winston-Salem, NC",
    "Garland, TX", "Glendale, AZ", "Hialeah, FL", "Reno, NV",
    "Baton Rouge, LA", "Irvine, CA", "Chesapeake, VA", "Irving, TX",
    "Scottsdale, AZ", "North Las Vegas, NV", "Fremont, CA", "Gilbert, AZ",
    "San Bernardino, CA", "Boise, ID", "Birmingham, AL",
)


@st.cache_data(ttl=3600)
def load_and_concat_data() -> pd.DataFrame:
    """Download every CSV file in the dataset repo and merge them.

    Files that cannot be downloaded or parsed are skipped (and logged) so a
    single bad file does not take the dashboard down.

    Returns:
        A frame restricted to ``COLUMNS_TO_KEEP`` with ``date_posted`` parsed
        to datetimes, exact duplicate rows removed and rows without a
        parseable date dropped. An empty frame when no file could be read.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # Put the token on the client so listing works for private datasets too,
    # not only the per-file download.
    api = HfApi(token=HF_TOKEN)
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    frames = []
    for name in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=name,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            frames.append(pd.read_csv(local_path))
        except Exception:
            # Deliberately best-effort: skip the file, but leave a trace.
            logger.warning(
                "Skipping dataset file %r: download or parse failed",
                name,
                exc_info=True,
            )

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)

    missing = [col for col in COLUMNS_TO_KEEP if col not in combined.columns]
    if missing:
        logger.warning("Dataset is missing expected columns: %s", missing)
    # reindex (rather than plain selection) fills absent columns with NaN
    # instead of raising KeyError.
    jobs = combined.reindex(columns=COLUMNS_TO_KEEP)

    jobs['date_posted'] = pd.to_datetime(jobs['date_posted'], errors='coerce')

    # Drop duplicates and rows with NaT in date_posted
    return jobs.drop_duplicates().dropna(subset=['date_posted'])


@st.cache_data()
def get_unique_values(df: pd.DataFrame) -> dict:
    """Return the distinct companies, locations and job types found in *df*.

    Returns:
        A dict with keys ``'companies'``, ``'locations'`` and ``'job_types'``,
        each holding the array produced by ``Series.unique()``.
    """
    return {
        'companies': df['company'].unique(),
        'locations': df['location'].unique(),
        'job_types': df['job_type'].unique(),
    }


def _apply_transparent_theme(fig):
    """Give *fig* transparent backgrounds and white text; return it."""
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
    )
    return fig


def create_chart(data, _x, y, title, color_sequence):
    """Build a themed bar chart.

    Args:
        data: Data object handed to ``plotly.express.bar``.
        _x: Values (or column name) for the x axis.
        y: Values (or column name) for the y axis.
        title: Chart title.
        color_sequence: Colours passed as ``color_discrete_sequence``.

    Returns:
        The Plotly figure.
    """
    fig = px.bar(
        data, x=_x, y=y, title=title,
        color_discrete_sequence=color_sequence,
    )
    return _apply_transparent_theme(fig)


def create_time_series(df: pd.DataFrame):
    """Build a themed line chart of posting counts per ``date_posted`` value."""
    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
    fig = px.line(
        df_by_date, x='date_posted', y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    return _apply_transparent_theme(fig)


def _match_location(raw) -> str:
    """Map one raw location value to a known location, or ``'Other'``."""
    # Missing locations arrive as NaN (a float); `in` on them would raise.
    if not isinstance(raw, str):
        return 'Other'
    return next((loc for loc in VALID_LOCATIONS if loc in raw), 'Other')


def parse_locations(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``parsed_location`` column derived from ``location``.

    Each raw location is mapped to the first entry of ``VALID_LOCATIONS`` it
    contains; anything else, including missing values, becomes ``'Other'``.

    The column is added to *df* in place and the same frame is returned.
    """
    df['parsed_location'] = df['location'].apply(_match_location)
    return df


def display_dashboard(df: pd.DataFrame) -> None:
    """Render the overview metrics and charts for *df*."""
    if df.empty:
        # min()/max() of an empty date column are NaT, which breaks the
        # date-range line below.
        st.warning("No job postings to display.")
        return

    # main() already parses locations; only do it when called standalone.
    if 'parsed_location' not in df.columns:
        df = parse_locations(df)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Unique Locations", df['parsed_location'].nunique())

        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")

    with col2:
        top_companies = df['company'].value_counts().head(10)
        fig = create_chart(
            top_companies, top_companies.index, top_companies.values,
            "Top 10 Companies", ['#4e79a7'],
        )
        st.plotly_chart(fig, use_container_width=True)

    # Job Postings Over Time Chart
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)

    with col3:
        top_locations = df['parsed_location'].value_counts().head(10)
        fig = create_chart(
            top_locations, top_locations.index, top_locations.values,
            "Top 10 Locations", ['#f28e2b'],
        )
        st.plotly_chart(fig, use_container_width=True)

    with col4:
        top_job_titles = df['title'].value_counts().head(20)
        fig = create_chart(
            top_job_titles, top_job_titles.index, top_job_titles.values,
            "Top 20 Job Titles", ['#59a14f'],
        )
        st.plotly_chart(fig, use_container_width=True)


@st.cache_data
def filter_dataframe(df, companies, locations, job_types):
    """Return the rows of *df* matching every non-empty selection.

    Args:
        df: Listings frame; must contain ``parsed_location``.
        companies: Allowed ``company`` values; empty means no restriction.
        locations: Allowed ``parsed_location`` values; empty means no
            restriction.
        job_types: Allowed ``job_type`` values; empty means no restriction.
    """
    filtered_df = df
    if companies:
        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
    if locations:
        filtered_df = filtered_df[filtered_df['parsed_location'].isin(locations)]
    if job_types:
        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
    return filtered_df


def _make_clickable(url) -> str:
    """Return HTML for one URL cell.

    http(s) URLs become a "Link" anchor; missing values become an empty
    string; anything else is shown as escaped text so it cannot inject markup
    or a script-bearing href.
    """
    if not isinstance(url, str) or not url.strip():
        return ''
    target = url.strip()
    if not target.lower().startswith(('http://', 'https://')):
        return html.escape(target)
    safe = html.escape(target, quote=True)
    return f'<a href="{safe}" target="_blank" rel="noopener noreferrer">Link</a>'


def _escape_cell(value):
    """HTML-escape string cells; pass every other value through untouched."""
    return html.escape(value) if isinstance(value, str) else value


def _listings_to_html(df: pd.DataFrame) -> str:
    """Render *df* as an HTML table with escaped text and clickable URLs."""
    # Work on a copy so the caller's frame keeps its raw URL values.
    table = df.copy()
    for column in table.columns:
        if column in URL_COLUMNS:
            table[column] = table[column].map(_make_clickable)
        else:
            # The table is emitted with escape=False (needed for the
            # anchors), so scraped text must be escaped by hand.
            table[column] = table[column].map(_escape_cell)
    return table.to_html(escape=False, index=False)


def display_data_explorer(df: pd.DataFrame) -> None:
    """Render the Data Explorer page: optional filters plus the listings table."""
    st.subheader("Data Explorer")

    show_all = st.radio("Display", ("All Data", "Filtered Data"))

    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)

        col1, col2, col3 = st.columns(3)
        with col1:
            companies = st.multiselect(
                "Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect(
                "Select Locations", options=df['parsed_location'].unique())
        with col3:
            job_types = st.multiselect(
                "Select Job Types", options=unique_values['job_types'])

        filtered_df = filter_dataframe(df, companies, locations, job_types)
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    st.write(_listings_to_html(filtered_df), unsafe_allow_html=True)


def main() -> None:
    """Entry point: load the data and show the page chosen in the sidebar."""
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()

    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    df = parse_locations(df)

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])

    if page == "Dashboard":
        display_dashboard(df)
    elif page == "Data Explorer":
        display_data_explorer(df)


if __name__ == "__main__":
    main()