import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
from datetime import datetime
import math
import pyarrow as pa
from pyarrow import csv
import pyarrow.compute as pc
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set page config for a wider layout and custom theme
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Custom CSS for black background and styling.
# NOTE: the original stylesheet was elided; this is a minimal placeholder.
st.markdown(
    """
    <style>
    .stApp { background-color: #000000; color: #FFFFFF; }
    </style>
    """,
    unsafe_allow_html=True,
)

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"


@st.cache_data(ttl=3600)
def load_and_concat_data():
    api = HfApi(token=HF_TOKEN)  # token is needed to access a private dataset
    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            file_path = api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
                filename=file,
                repo_type="dataset",
            )
            # Use PyArrow to read the CSV. Column names are supplied
            # explicitly, so skip the header row in each file.
            read_options = csv.ReadOptions(
                column_names=[
                    'site', 'job_url', 'title', 'company', 'location',
                    'job_type', 'date_posted', 'is_remote', 'company_url'
                ],
                skip_rows=1,
            )
            parse_options = csv.ParseOptions(delimiter=',')
            convert_options = csv.ConvertOptions(timestamp_parsers=['%Y-%m-%d'])
            table = csv.read_csv(
                file_path,
                read_options=read_options,
                parse_options=parse_options,
                convert_options=convert_options,
            )
            all_data.append(table)
        except Exception as e:
            print(f"Error processing file {file}: {e}")

    if not all_data:
        # Return an empty DataFrame so callers can check .empty
        return pd.DataFrame()

    # Concatenate all tables
    concatenated_table = pa.concat_tables(all_data)

    # Keep only postings dated in 2024
    mask = pc.equal(pc.year(concatenated_table['date_posted']), 2024)
    filtered_table = concatenated_table.filter(mask)

    # Lowercase titles and company names
    filtered_table = filtered_table.set_column(
        filtered_table.schema.get_field_index('title'),
        'title',
        pc.utf8_lower(filtered_table['title'])
    )
    filtered_table = filtered_table.set_column(
        filtered_table.schema.get_field_index('company'),
        'company',
        pc.utf8_lower(filtered_table['company'])
    )

    # Clean location: lowercase and strip a trailing ", us" / ", usa".
    # PyArrow has no element-wise `pc.map`, so use compute kernels instead
    # (nulls propagate through the kernels automatically).
    cleaned_locations = pc.replace_substring_regex(
        pc.utf8_lower(filtered_table['location']),
        pattern=r',\s*(us|usa)$',
        replacement=''
    )
    filtered_table = filtered_table.set_column(
        filtered_table.schema.get_field_index('location'),
        'location',
        cleaned_locations
    )

    # Remove duplicate rows by grouping on every column
    filtered_table = filtered_table.group_by(filtered_table.column_names).aggregate([])

    # Convert to a pandas DataFrame for compatibility with the rest of the app
    return filtered_table.to_pandas()
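
# Optional: the serial download loop above can be slow when the dataset has
# many files. A minimal sketch of a concurrent variant using
# ThreadPoolExecutor (the helper name `download_one` and the worker count are
# illustrative assumptions, not part of the original app; CSV read options
# are omitted for brevity and should mirror those used above):
def load_tables_concurrently(api, csv_files, max_workers=8):
    def download_one(file):
        # Download one file and parse it into a PyArrow table
        path = api.hf_hub_download(
            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
            filename=file,
            repo_type="dataset",
        )
        return csv.read_csv(path)

    tables = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(download_one, f): f for f in csv_files}
        for future in as_completed(futures):
            try:
                tables.append(future.result())
            except Exception as e:
                print(f"Error processing file {futures[future]}: {e}")
    return tables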
@st.cache_data()
def get_unique_values(df):
    return {
        'companies': df['company'].unique(),
        'locations': df['location'].unique(),
        'job_types': df['job_type'].unique(),
        'Role_Name': df['title'].unique(),
        'Date_posted': df['date_posted'].unique()
    }


@st.cache_data
def prepare_dashboard_data(df):
    top_companies = df['company'].value_counts().head(10)
    top_locations = df['location'].value_counts().head(10)
    top_job_titles = df['title'].value_counts().head(20)
    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
    return top_companies, top_locations, top_job_titles, df_by_date


def create_chart(data, _x, y, title, color_sequence):
    fig = px.bar(data, x=_x, y=y, title=title, color_discrete_sequence=color_sequence)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
    return fig


def create_time_series(df, time_unit='day'):
    if time_unit == 'week':
        # Group by week
        df_by_date = df.groupby(df['date_posted'].dt.to_period('W')).size().reset_index(name='count')
        df_by_date['date_posted'] = df_by_date['date_posted'].dt.to_timestamp()
    else:
        # Keep daily grouping
        df_by_date = df.groupby('date_posted').size().reset_index(name='count')

    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time",
                  color_discrete_sequence=['#4e79a7'])
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
        xaxis_title="Date",
        yaxis_title="Number of Job Postings"
    )
    # Adjust x-axis ticks for the weekly view
    if time_unit == 'week':
        fig.update_xaxes(
            dtick="W1",
            tickformat="%d %b %Y",
            ticklabelmode="period"
        )
    return fig


def display_dashboard(df):
    top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)

    today = datetime.now().date()
    jobs_today = df[df['date_posted'].dt.date == today].shape[0]

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", jobs_today)
        min_date = df['date_posted'].min().date()
        max_date = df['date_posted'].max().date()
        st.write(f"Job postings from {min_date} to {max_date}")
    with col2:
        fig = create_chart(top_companies, top_companies.index, top_companies.values,
                           "Top 10 Companies", ['#4e79a7'])
        st.plotly_chart(fig, use_container_width=True)

    # Job postings over time chart
    fig_time_series = create_time_series(df)
    st.plotly_chart(fig_time_series, use_container_width=True)

    col3, col4 = st.columns(2)
    with col3:
        fig = create_chart(top_locations, top_locations.index, top_locations.values,
                           "Top 10 Locations", ['#f28e2b'])
        st.plotly_chart(fig, use_container_width=True)
    with col4:
        fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values,
                           "Top 20 Job Titles", ['#59a14f'])
        st.plotly_chart(fig, use_container_width=True)


@st.cache_data
def filter_dataframe(df, companies, locations, job_types, Role_Name, Date_posted):
    filtered_df = df
    if companies:
        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
    if locations:
        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
    if job_types:
        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
    if Role_Name:
        filtered_df = filtered_df[filtered_df['title'].isin(Role_Name)]
    if Date_posted:
        filtered_df = filtered_df[filtered_df['date_posted'].isin(Date_posted)]
    return filtered_df
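
# Note: create_time_series() also supports time_unit='week', but
# display_dashboard() always calls it with the daily default. A minimal
# sketch of exposing that toggle in the UI (hypothetical, not part of the
# original app) would be:
#
#     unit = st.radio("Time granularity", ["day", "week"], horizontal=True)
#     st.plotly_chart(create_time_series(df, time_unit=unit), use_container_width=True)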
listings") # Pagination items_per_page = 15 num_pages = math.ceil(len(filtered_df) / items_per_page) col1, col2, col3 = st.columns([1, 3, 1]) with col2: page = st.number_input("Page", min_value=1, max_value=num_pages, value=1) start_idx = (page - 1) * items_per_page end_idx = start_idx + items_per_page page_df = filtered_df.iloc[start_idx:end_idx] def make_clickable(url): return f'Link' page_df['job_url'] = page_df['job_url'].apply(make_clickable) page_df['company_url'] = page_df['company_url'].apply(make_clickable) st.write(page_df.to_html(escape=False, index=False), unsafe_allow_html=True) col1, col2, col3 = st.columns([1, 3, 1]) with col2: st.write(f"Page {page} of {num_pages}") def display_about_page(): st.markdown(""" ## What is this application? The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles. ### Key Features: - **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs. - **Data Explorer**: Dive deep into individual job listings with advanced filtering options. - **Real-time Data**: Fetch the latest job data from our Hugging Face dataset. ## How to use this application ### Dashboard 1. Navigate to the Dashboard using the sidebar. 2. View overall statistics such as total job postings, unique companies, and today's postings. 3. Explore interactive charts showing: - Top companies hiring - Job postings over time - Top locations for job opportunities - Most common job titles ### Data Explorer 1. Switch to the Data Explorer using the sidebar. 2. Choose between viewing all data or applying filters. 3. Use the multi-select dropdowns to filter by: - Companies - Locations - Job Types 4. Browse the filtered job listings table. 5. Click on job or company links to view more details on the original posting site. ## Data Source This application fetches data from my Private dataset which scrapes data from varoious job hosting portal and the data gets updated daily. ## Contact For questions, feedback, or collaboration opportunities, feel free to reach out: - LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/) """) # Add a clickable LinkedIn button linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/" st.markdown(f""" """, unsafe_allow_html=True) def main(): st.title("Job Easz") df = load_and_concat_data() if df.empty: st.error("No data available. Please check your dataset.") return # Sidebar for navigation st.sidebar.title("Navigation") page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer","About"]) if page == "Dashboard": display_dashboard(df) elif page == "Data Explorer": display_data_explorer(df) elif page == "About": display_about_page() if __name__ == "__main__": main()