Spaces:

Niharmahesh
/

job_easz

Running

File size: 3,942 Bytes

f2446d9

import streamlit as st
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi
import io
from datetime import datetime, timedelta

# Hugging Face setup
# The access token and account name are read from Streamlit's secrets store
# by subscript access with no fallback, so both keys must be configured.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Dataset repository name; joined with HF_USERNAME as "<username>/<dataset>"
# to form the repo id used when listing and downloading files.
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data():
    """Download every CSV file in the Hugging Face dataset and merge them.

    The repository ``{HF_USERNAME}/{DATASET_NAME}`` is listed, each file whose
    name ends in ``.csv`` is downloaded and parsed, and the resulting frames
    are concatenated. Exact duplicate rows are dropped (compared across all
    original columns), ``date_posted`` is converted to datetime, and only the
    columns used by the app are kept.

    Returns:
        A ``pandas.DataFrame`` with the columns ``site``, ``job_url``,
        ``title``, ``company``, ``location``, ``job_type``, ``date_posted``,
        ``is_remote``, ``description`` and ``company_url`` and a fresh
        0..n-1 index. The frame is empty (but still has these columns, with
        ``date_posted`` as datetime) when the repository has no CSV files.

    Raises:
        KeyError: if the merged data lacks one of the expected columns.
    """
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

    api = HfApi()
    # Authenticate the listing too, not just the downloads: without the token
    # a private or gated dataset cannot be listed at all.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    if not csv_files:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected schema instead.
        empty_df = pd.DataFrame(columns=columns_to_keep)
        empty_df['date_posted'] = pd.to_datetime(empty_df['date_posted'])
        return empty_df

    # hf_hub_download returns the local path of the downloaded file.
    all_data = [
        pd.read_csv(
            api.hf_hub_download(repo_id=repo_id, filename=name, repo_type="dataset", token=HF_TOKEN)
        )
        for name in csv_files
    ]

    concatenated_df = pd.concat(all_data, ignore_index=True)
    concatenated_df = concatenated_df.drop_duplicates()

    # Ensure 'date_posted' is in datetime format
    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'])

    # Filter columns
    return concatenated_df[columns_to_keep].reset_index(drop=True)

def filter_data(df, start_date, end_date, selected_locations, selected_roles):
    filtered_df = df[
        (df['date_posted'].dt.date >= start_date) &
        (df['date_posted'].dt.date <= end_date)
    ]
    if selected_locations:
        filtered_df = filtered_df[filtered_df['location'].isin(selected_locations)]
    if selected_roles:
        filtered_df = filtered_df[filtered_df['title'].isin(selected_roles)]
    return filtered_df

def dashboard():
    """Render the dashboard page: sidebar filters, a metric, charts and recent jobs.

    Loads the job data, lets the user narrow it by posting-date range,
    location and job title from the sidebar, and then shows the total number
    of matching postings, a daily time series, the ten most common locations
    and titles, and the five most recent postings.

    If no row has a usable posting date, or the chosen start date is after the
    end date, a message is shown and the rest of the page is skipped.
    """
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()

    # min()/max() yield NaT when the frame is empty or holds no valid dates,
    # and calling .date() on NaT raises; stop early with a message instead.
    earliest = df['date_posted'].min()
    latest = df['date_posted'].max()
    if pd.isna(earliest) or pd.isna(latest):
        st.warning("No job postings with a valid posting date are available.")
        return

    st.sidebar.header("Filters")
    start_date = st.sidebar.date_input("Start Date", earliest.date())
    end_date = st.sidebar.date_input("End Date", latest.date())
    locations = st.sidebar.multiselect("Locations", options=df['location'].unique())
    roles = st.sidebar.multiselect("Job Roles", options=df['title'].unique())

    # An inverted range can never match anything; say so rather than
    # rendering a page of empty charts.
    if start_date > end_date:
        st.sidebar.error("Start Date must not be after End Date.")
        return

    filtered_df = filter_data(df, start_date, end_date, locations, roles)

    st.metric("Total Job Postings", len(filtered_df))

    # Number of postings per calendar day.
    daily_postings = filtered_df.groupby(filtered_df['date_posted'].dt.date).size().reset_index(name='count')
    fig_time_series = px.line(daily_postings, x='date_posted', y='count', title='Daily Job Postings')
    st.plotly_chart(fig_time_series)

    location_counts = filtered_df['location'].value_counts().head(10)
    fig_location = px.bar(location_counts, x=location_counts.index, y=location_counts.values, title='Top 10 Locations')
    st.plotly_chart(fig_location)

    role_counts = filtered_df['title'].value_counts().head(10)
    fig_role = px.bar(role_counts, x=role_counts.index, y=role_counts.values, title='Top 10 Job Roles')
    st.plotly_chart(fig_role)

    st.subheader("Recent Job Postings")
    recent_postings = filtered_df.sort_values('date_posted', ascending=False).head(5)
    for _, job in recent_postings.iterrows():
        st.write(f"**{job['title']}** - {job['company']} - {job['location']} - {job['date_posted'].date()}")

def data_table():
    """Render the full job listings table with clickable URL columns.

    ``st.dataframe`` does not take an ``unsafe_allow_html`` argument and does
    not render HTML anchors produced by a Styler formatter, so the
    ``job_url`` and ``company_url`` columns are configured as link columns
    instead. Missing ``company_url`` values are left as empty cells.
    """
    st.title("Full Job Listings Data")

    df = load_and_concat_data()

    # NOTE(review): display_text needs a Streamlit release whose LinkColumn
    # supports it -- confirm against the version pinned for this app.
    st.dataframe(
        df,
        column_config={
            'job_url': st.column_config.LinkColumn(display_text="Link"),
            'company_url': st.column_config.LinkColumn(display_text="Link"),
        },
    )

def main():
    """Draw the sidebar page selector and render the page the user picked."""
    # Page label -> function that renders it; dict order is the radio order.
    pages = {
        "Dashboard": dashboard,
        "Data Table": data_table,
    }

    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", list(pages))

    render = pages.get(page)
    if render is not None:
        render()

# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()