import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta
# Hugging Face setup
# NOTE(review): these secrets must be configured in .streamlit/secrets.toml
# (or the Streamlit Cloud secrets UI); missing keys raise at import time.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Name of the Hugging Face *dataset* repo holding the scraped CSV files.
DATASET_NAME = "jobeasz"
@st.cache_data(ttl=3600)  # cache for 1 hour so reruns don't re-download the dataset
def load_and_concat_data():
    """Download every CSV in the HF dataset repo, concatenate, and dedupe.

    Returns:
        pd.DataFrame with columns site, job_url, title, company, location,
        job_type, date_posted (datetime64), is_remote, description,
        company_url. Empty (but correctly typed) if the repo has no CSVs.
    """
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Pass the token here too: without it, listing a *private* dataset repo
    # fails with 401 even though the download call below authenticates.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [file for file in dataset_files if file.endswith('.csv')]
    all_data = []
    for file in csv_files:
        # hf_hub_download returns the local path of the cached file.
        local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
        all_data.append(pd.read_csv(local_path))
    if not all_data:
        # pd.concat([]) raises ValueError("No objects to concatenate");
        # return an empty frame with the expected schema instead.
        empty = pd.DataFrame(columns=columns_to_keep)
        empty['date_posted'] = pd.to_datetime(empty['date_posted'])
        return empty
    concatenated_df = pd.concat(all_data, ignore_index=True).drop_duplicates()
    # Downstream code relies on .dt accessors, so coerce to datetime here.
    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'])
    return concatenated_df[columns_to_keep].reset_index(drop=True)
def filter_data(df, start_date, end_date, selected_locations, selected_roles):
    """Filter job listings by posting-date window, location, and title.

    Args:
        df: listings frame with a datetime64 'date_posted' column.
        start_date / end_date: inclusive ``datetime.date`` bounds.
        selected_locations: locations to keep; falsy means "all".
        selected_roles: titles to keep; falsy means "all".

    Returns:
        A filtered view of *df* (original frame is untouched).
    """
    posted_on = df['date_posted'].dt.date
    result = df[(posted_on >= start_date) & (posted_on <= end_date)]
    if selected_locations:
        result = result[result['location'].isin(selected_locations)]
    if selected_roles:
        result = result[result['title'].isin(selected_roles)]
    return result
def dashboard():
    """Render the dashboard page: sidebar filters, KPI, charts, recent jobs."""
    st.title("Job Listings Dashboard")
    df = load_and_concat_data()

    # --- sidebar filter controls -------------------------------------------
    st.sidebar.header("Filters")
    start_date = st.sidebar.date_input("Start Date", df['date_posted'].min().date())
    end_date = st.sidebar.date_input("End Date", df['date_posted'].max().date())
    locations = st.sidebar.multiselect("Locations", options=df['location'].unique())
    roles = st.sidebar.multiselect("Job Roles", options=df['title'].unique())

    filtered_df = filter_data(df, start_date, end_date, locations, roles)
    st.metric("Total Job Postings", len(filtered_df))

    # --- postings over time ------------------------------------------------
    per_day = (
        filtered_df
        .groupby(filtered_df['date_posted'].dt.date)
        .size()
        .reset_index(name='count')
    )
    st.plotly_chart(px.line(per_day, x='date_posted', y='count', title='Daily Job Postings'))

    # --- top-10 breakdowns -------------------------------------------------
    top_locations = filtered_df['location'].value_counts().head(10)
    st.plotly_chart(px.bar(top_locations, x=top_locations.index, y=top_locations.values, title='Top 10 Locations'))

    top_roles = filtered_df['title'].value_counts().head(10)
    st.plotly_chart(px.bar(top_roles, x=top_roles.index, y=top_roles.values, title='Top 10 Job Roles'))

    # --- five most recent postings -----------------------------------------
    st.subheader("Recent Job Postings")
    latest = filtered_df.sort_values('date_posted', ascending=False).head(5)
    for _, job in latest.iterrows():
        st.write(f"**{job['title']}** - {job['company']} - {job['location']} - {job['date_posted'].date()}")
def data_table():
    """Render the full listings table with clickable job/company links."""
    st.title("Full Job Listings Data")
    df = load_and_concat_data()
    # The old formatters emitted the bare text 'Link' (no anchor markup), and
    # st.dataframe() has no `unsafe_allow_html` keyword — that call raised
    # TypeError at runtime. Format real anchors and render the Styler's HTML
    # through st.markdown, which does accept unsafe_allow_html.
    styled = df.style.format({
        'job_url': lambda x: f'<a href="{x}" target="_blank">Link</a>' if pd.notnull(x) else '',
        'company_url': lambda x: f'<a href="{x}" target="_blank">Link</a>' if pd.notnull(x) else ''
    })
    st.markdown(styled.to_html(), unsafe_allow_html=True)
def main():
    """App entry point: route to a page via sidebar navigation."""
    st.sidebar.title("Navigation")
    # Map radio labels to page-rendering callables; dict order drives the
    # radio option order, matching the original list.
    pages = {"Dashboard": dashboard, "Data Table": data_table}
    choice = st.sidebar.radio("Go to", list(pages))
    pages[choice]()


if __name__ == "__main__":
    main()