"""Streamlit dashboard for exploring job listings stored in a Hugging Face dataset.

The app has three pages selected from the sidebar: a dashboard of summary
charts, a paginated data explorer with optional filters, and an about page.
"""

import html
import io
import logging
import math
import time
from datetime import datetime, timedelta

import pandas as pd
import plotly.express as px
import pyarrow as pa
import pyarrow.parquet as pq
import streamlit as st
from huggingface_hub import HfApi

logger = logging.getLogger(__name__)

# Set page config for a wider layout and custom theme
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Custom CSS for black background and styling
# NOTE(review): both style payloads below are blank in this revision -- confirm
# whether the <style> markup was lost and needs to be restored.
st.markdown(""" """, unsafe_allow_html=True)
st.markdown(""" """, unsafe_allow_html=True)

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

# Columns retained from the raw CSV files.
COLUMNS_TO_KEEP = [
    'site', 'job_url', 'title', 'company', 'location',
    'job_type', 'date_posted', 'is_remote', 'company_url',
]

# Columns whose values are URLs and are rendered as anchors in the explorer.
LINK_COLUMNS = ('job_url', 'company_url')

# Rows shown per page in the data explorer.
ITEMS_PER_PAGE = 15

# Layout options shared by every chart (transparent background, white text).
_CHART_LAYOUT = {
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'font_color': '#FFFFFF',
}

_ABOUT_MARKDOWN = """
## What is this application?

The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.

### Key Features:
- **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs.
- **Data Explorer**: Dive deep into individual job listings with advanced filtering options.
- **Real-time Data**: Fetch the latest job data from our Hugging Face dataset.

## How to use this application

### Dashboard
1. Navigate to the Dashboard using the sidebar.
2. View overall statistics such as total job postings, unique companies, and today's postings.
3. Explore interactive charts showing:
   - Top companies hiring
   - Job postings over time
   - Top locations for job opportunities
   - Most common job titles

### Data Explorer
1. Switch to the Data Explorer using the sidebar.
2. Choose between viewing all data or applying filters.
3. Use the multi-select dropdowns to filter by:
   - Companies
   - Locations
   - Job Types
4. Browse the filtered job listings table.
5. Click on job or company links to view more details on the original posting site.

## Data Source
This application fetches data from my Private dataset which scrapes data from varoious job hosting portal and the data gets updated daily.

## Contact
For questions, feedback, or collaboration opportunities, feel free to reach out:
- LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/)
"""


@st.cache_data(ttl=3600)
def load_and_concat_data():
    """Download every CSV file in the dataset repo and merge them into one frame.

    Files that cannot be downloaded or parsed are skipped (and logged) so one
    bad file does not take the whole app down.

    Returns:
        A DataFrame restricted to ``COLUMNS_TO_KEEP`` with ``date_posted``
        parsed to datetimes, exact duplicate rows removed and rows without a
        parseable date dropped. An empty DataFrame when no file could be read.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Pass the token here as well: the download call below already needs it,
    # and listing a non-public repo without it would fail.
    dataset_files = api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    frames = []
    for name in csv_files:
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=name,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            frames.append(pd.read_csv(local_path, engine='pyarrow'))
        except Exception:
            # Best effort: keep going, but leave a trace instead of hiding it.
            logger.warning("Skipping dataset file %r", name, exc_info=True)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    # reindex (rather than [] selection) tolerates a column that is absent
    # from every file: it is created filled with NaN instead of raising.
    filtered_df = combined.reindex(columns=COLUMNS_TO_KEEP)
    filtered_df['date_posted'] = pd.to_datetime(
        filtered_df['date_posted'], errors='coerce'
    )
    # Drop duplicates and rows with NaT in date_posted
    return filtered_df.drop_duplicates().dropna(subset=['date_posted'])


@st.cache_data()
def get_unique_values(df):
    """Return the distinct values of each filterable column.

    Args:
        df: Job listings frame with ``company``, ``location``, ``job_type``,
            ``title`` and ``date_posted`` columns.

    Returns:
        A dict mapping ``companies``, ``locations``, ``job_types``,
        ``Role_Name`` and ``Date_posted`` to arrays of unique values.
    """
    source_columns = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
        'Role_Name': 'title',
        'Date_posted': 'date_posted',
    }
    return {key: df[column].unique() for key, column in source_columns.items()}


def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled with the shared transparent layout.

    Args:
        data: Data passed to ``plotly.express.bar`` as its first argument.
        _x: Values (or column reference) for the x axis.
        y: Values (or column reference) for the y axis.
        title: Chart title.
        color_sequence: Colours passed as ``color_discrete_sequence``.

    Returns:
        The configured Plotly figure.
    """
    fig = px.bar(
        data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence
    )
    fig.update_layout(**_CHART_LAYOUT)
    return fig


def create_time_series(df):
    """Build a line chart of the number of postings per ``date_posted`` value.

    Args:
        df: Job listings frame with a ``date_posted`` column.

    Returns:
        The configured Plotly figure.
    """
    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
    fig = px.line(
        df_by_date,
        x='date_posted',
        y='count',
        title="Job Postings Over Time",
        color_discrete_sequence=['#4e79a7'],
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Number of Job Postings",
        **_CHART_LAYOUT,
    )
    return fig


@st.cache_data
def prepare_dashboard_data(df):
    """Compute the aggregates shown on the dashboard.

    Args:
        df: Job listings frame.

    Returns:
        A 4-tuple: top 10 companies, top 10 locations and top 20 job titles
        (each a ``value_counts`` Series), plus a frame of posting counts per
        ``date_posted`` value.
    """
    top_companies = df['company'].value_counts().head(10)
    top_locations = df['location'].value_counts().head(10)
    top_job_titles = df['title'].value_counts().head(20)
    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
    return top_companies, top_locations, top_job_titles, df_by_date


def display_dashboard(df):
    """Render the dashboard page: headline metrics and four charts.

    Args:
        df: Non-empty job listings frame with a datetime ``date_posted``.
    """
    # The per-date frame is not needed here: create_time_series derives its
    # own from df.
    top_companies, top_locations, top_job_titles, _ = prepare_dashboard_data(df)

    today = datetime.now().date()
    jobs_today = df[df['date_posted'].dt.date == today].shape[0]

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", jobs_today)

        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")
    with col2:
        fig = create_chart(
            top_companies,
            top_companies.index,
            top_companies.values,
            "Top 10 Companies",
            ['#4e79a7'],
        )
        st.plotly_chart(fig, use_container_width=True)

    # Job Postings Over Time Chart
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)
    with col3:
        fig = create_chart(
            top_locations,
            top_locations.index,
            top_locations.values,
            "Top 10 Locations",
            ['#f28e2b'],
        )
        st.plotly_chart(fig, use_container_width=True)
    with col4:
        fig = create_chart(
            top_job_titles,
            top_job_titles.index,
            top_job_titles.values,
            "Top 20 Job Titles",
            ['#59a14f'],
        )
        st.plotly_chart(fig, use_container_width=True)


@st.cache_data
def filter_dataframe(df, companies, locations, job_types, Role_Name, Date_posted):
    """Keep only rows matching every non-empty selection.

    Args:
        df: Job listings frame.
        companies: Selected ``company`` values; empty/None means no filter.
        locations: Selected ``location`` values; empty/None means no filter.
        job_types: Selected ``job_type`` values; empty/None means no filter.
        Role_Name: Selected ``title`` values; empty/None means no filter.
        Date_posted: Selected ``date_posted`` values; empty/None means no filter.

    Returns:
        The filtered frame (``df`` itself when nothing is selected).
    """
    selections = (
        ('company', companies),
        ('location', locations),
        ('job_type', job_types),
        # The original referenced an undefined name (`Role_name`) here, which
        # raised NameError as soon as a role was selected.
        ('title', Role_Name),
        ('date_posted', Date_posted),
    )
    filtered_df = df
    for column, selected in selections:
        if selected is not None and len(selected) > 0:
            filtered_df = filtered_df[filtered_df[column].isin(selected)]
    return filtered_df


def _make_clickable(url):
    """Return an HTML anchor labelled "Link" for *url*.

    Missing or blank values become an empty cell. Values that are not
    http(s) URLs are shown as escaped text rather than turned into a link.
    """
    if not isinstance(url, str) or not url.strip():
        return ""
    cleaned = url.strip()
    escaped = html.escape(cleaned, quote=True)
    if not cleaned.lower().startswith(("http://", "https://")):
        return escaped
    return f'<a href="{escaped}" target="_blank" rel="noopener noreferrer">Link</a>'


def _escape_text_cells(frame, skip_columns):
    """HTML-escape string cells in place, leaving *skip_columns* untouched.

    The table is rendered with ``escape=False`` so the link anchors survive;
    every other text value must therefore be escaped by hand.
    """
    for column in frame.columns:
        if column in skip_columns:
            continue
        series = frame[column]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            frame[column] = series.map(
                lambda value: html.escape(value) if isinstance(value, str) else value
            )


def display_data_explorer(df):
    """Render the data explorer page: optional filters plus a paginated table.

    Args:
        df: Job listings frame.
    """
    st.subheader("Data Explorer")

    show_all = st.radio("Display", ("All Data", "Filtered Data"))

    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            companies = st.multiselect(
                "Select Companies", options=unique_values['companies']
            )
        with col2:
            locations = st.multiselect(
                "Select Locations", options=unique_values['locations']
            )
        with col3:
            job_types = st.multiselect(
                "Select Job Types", options=unique_values['job_types']
            )
        with col4:
            roles = st.multiselect(
                "Select Role Types", options=unique_values['Role_Name']
            )
        with col5:
            # This widget previously reused the "Select Role Types" label.
            dates_posted = st.multiselect(
                "Select Dates Posted", options=unique_values['Date_posted']
            )

        filtered_df = filter_dataframe(
            df, companies, locations, job_types, roles, dates_posted
        )
    else:
        filtered_df = df

    st.write(f"Showing {len(filtered_df)} job listings")

    # Pagination
    # Clamp to one page so an empty result does not produce max_value=0,
    # which is below the widget's min_value/value of 1.
    num_pages = max(1, math.ceil(len(filtered_df) / ITEMS_PER_PAGE))

    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)

    start_idx = (page - 1) * ITEMS_PER_PAGE
    end_idx = start_idx + ITEMS_PER_PAGE
    # Copy the slice: the columns are rewritten below and must not write
    # through to (or warn about) the cached source frame.
    page_df = filtered_df.iloc[start_idx:end_idx].copy()

    _escape_text_cells(page_df, skip_columns=LINK_COLUMNS)
    for column in LINK_COLUMNS:
        page_df[column] = page_df[column].apply(_make_clickable)

    st.write(page_df.to_html(escape=False, index=False), unsafe_allow_html=True)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        st.write(f"Page {page} of {num_pages}")


def display_about_page():
    """Render the about page: app description, usage notes and contact info."""
    st.markdown(_ABOUT_MARKDOWN)

    # Add a clickable LinkedIn button
    # NOTE(review): the button markup below is blank in this revision and
    # `linkedin_url` is unused -- confirm whether the HTML was lost.
    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"
    st.markdown(f""" """, unsafe_allow_html=True)


def main():
    """Load the data and route to the page chosen in the sidebar."""
    st.title("Job Easz")

    df = load_and_concat_data()

    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer", "About"])

    if page == "Dashboard":
        display_dashboard(df)
    elif page == "Data Explorer":
        display_data_explorer(df)
    elif page == "About":
        display_about_page()


if __name__ == "__main__":
    main()