Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
from huggingface_hub import HfApi
|
5 |
+
import io
|
6 |
+
from datetime import datetime, timedelta
|
7 |
+
|
8 |
+
# Hugging Face setup
# Credentials are read from Streamlit secrets (.streamlit/secrets.toml or the
# Space's Secrets settings) — never hard-coded in the source.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Name of the HF dataset repo ({HF_USERNAME}/{DATASET_NAME}) that holds the
# scraped job-listing CSV files loaded by load_and_concat_data().
DATASET_NAME = "jobeasz"
|
12 |
+
|
13 |
+
@st.cache_data(ttl=3600)  # Cache for 1 hour so reruns don't re-download the dataset
def load_and_concat_data():
    """Download every CSV in the configured HF dataset repo and combine them.

    Returns:
        pd.DataFrame: de-duplicated listings restricted to the columns the
        app uses, with 'date_posted' parsed to datetime (unparseable values
        become NaT instead of aborting the load).

    Raises:
        ValueError: if the dataset repo contains no CSV files.
    """
    api = HfApi()
    # Pass the token here as well as in the download call below, so private
    # dataset repos can be listed, not just downloaded from.
    dataset_files = api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    csv_files = [file for file in dataset_files if file.endswith('.csv')]
    if not csv_files:
        # Without this guard, pd.concat([]) raises a confusing
        # "No objects to concatenate" error.
        raise ValueError(f"No CSV files found in {HF_USERNAME}/{DATASET_NAME}")

    all_data = []
    for file in csv_files:
        file_content = api.hf_hub_download(
            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
            filename=file,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        all_data.append(pd.read_csv(file_content))

    concatenated_df = pd.concat(all_data, ignore_index=True).drop_duplicates()

    # Ensure 'date_posted' is in datetime format; errors='coerce' turns bad
    # rows into NaT rather than raising and losing the entire dataset.
    concatenated_df['date_posted'] = pd.to_datetime(
        concatenated_df['date_posted'], errors='coerce'
    )

    # Keep only the columns the dashboard and data-table pages actually use.
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    return concatenated_df[columns_to_keep].reset_index(drop=True)
|
39 |
+
|
40 |
+
def filter_data(df, start_date, end_date, selected_locations, selected_roles):
    """Return the rows of *df* posted within [start_date, end_date] (inclusive),
    optionally restricted to the given locations and job titles.

    An empty selection for locations or roles means "no restriction" on
    that field.
    """
    posted = df['date_posted'].dt.date
    result = df[(posted >= start_date) & (posted <= end_date)]
    if selected_locations:
        result = result[result['location'].isin(selected_locations)]
    if selected_roles:
        result = result[result['title'].isin(selected_roles)]
    return result
|
50 |
+
|
51 |
+
def dashboard():
    """Render the dashboard page: sidebar filters, a headline count metric,
    three Plotly charts (daily postings, top locations, top roles), and the
    five most recent postings."""
    st.title("Job Listings Dashboard")

    df = load_and_concat_data()

    # Sidebar filter widgets; date pickers default to the full span of the data.
    st.sidebar.header("Filters")
    start_date = st.sidebar.date_input("Start Date", df['date_posted'].min().date())
    end_date = st.sidebar.date_input("End Date", df['date_posted'].max().date())
    locations = st.sidebar.multiselect("Locations", options=df['location'].unique())
    roles = st.sidebar.multiselect("Job Roles", options=df['title'].unique())

    view = filter_data(df, start_date, end_date, locations, roles)

    st.metric("Total Job Postings", len(view))

    # Postings per calendar day as a line chart.
    per_day = (
        view.groupby(view['date_posted'].dt.date)
        .size()
        .reset_index(name='count')
    )
    st.plotly_chart(
        px.line(per_day, x='date_posted', y='count', title='Daily Job Postings')
    )

    # Ten most frequent locations.
    top_locations = view['location'].value_counts().head(10)
    st.plotly_chart(
        px.bar(top_locations, x=top_locations.index, y=top_locations.values,
               title='Top 10 Locations')
    )

    # Ten most frequent job titles.
    top_roles = view['title'].value_counts().head(10)
    st.plotly_chart(
        px.bar(top_roles, x=top_roles.index, y=top_roles.values,
               title='Top 10 Job Roles')
    )

    st.subheader("Recent Job Postings")
    newest = view.sort_values('date_posted', ascending=False).head(5)
    for _, job in newest.iterrows():
        st.write(f"**{job['title']}** - {job['company']} - {job['location']} - {job['date_posted'].date()}")
|
82 |
+
|
83 |
+
def data_table():
    """Render the full listings table with clickable job/company links.

    Fix: the original passed ``unsafe_allow_html=True`` to ``st.dataframe``,
    which has no such parameter (it raises TypeError) and does not render
    HTML from a pandas Styler anyway. The styled table is emitted as HTML
    via ``st.markdown``, which does accept ``unsafe_allow_html``.
    """
    st.title("Full Job Listings Data")

    df = load_and_concat_data()

    # Turn the URL columns into anchor tags; missing company URLs render blank.
    styled = df.style.format({
        'job_url': lambda x: f'<a href="{x}" target="_blank">Link</a>',
        'company_url': lambda x: f'<a href="{x}" target="_blank">Link</a>' if pd.notnull(x) else ''
    })
    st.markdown(styled.to_html(), unsafe_allow_html=True)
|
95 |
+
|
96 |
+
def main():
    """Top-level page router: a sidebar radio selects which page renders."""
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Dashboard", "Data Table"])

    # Dispatch table instead of an if/elif chain; the radio widget can only
    # return one of these two labels.
    pages = {"Dashboard": dashboard, "Data Table": data_table}
    handler = pages.get(page)
    if handler is not None:
        handler()
|
104 |
+
|
105 |
+
# Standard script entry point: run the app only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|