Update app.py
app.py
CHANGED
@@ -1,195 +1,120 @@
import streamlit as st
import pandas as pd
-...
from huggingface_hub import HfApi
import io
-...
-import time
-
-# Set page config for a wider layout and custom theme
-st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
-
-# Custom CSS for black background and styling
-st.markdown("""
-<style>
-    .stApp {
-        background-color: #000000;
-        color: #FFFFFF;
-    }
-    .stButton>button {
-        background-color: #4e79a7;
-        color: white;
-    }
-    .stSelectbox, .stMultiSelect {
-        color: #FFFFFF;
-    }
-    .stDataFrame {
-        background-color: #1E1E1E;
-    }
-    .plotly-graph-div {
-        background-color: #1E1E1E;
-    }
-    .big-font {
-        font-size: 48px;
-        font-weight: bold;
-        text-align: center;
-    }
-</style>
-""", unsafe_allow_html=True)

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

-@st.cache_data
-def load_and_concat_data():
-    ...
-            pass  # Silently skip files that can't be processed
-    ...
-        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
-    ]
-    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-    ...
-
-@st.cache_data
-def get_unique_values(df):
-    return {
-        'companies': df['company'].unique(),
-        'locations': df['location'].unique(),
-        'job_types': df['job_type'].unique()
-    }
-
-def display_timer():
-    placeholder = st.empty()
-    for i in range(15, 0, -1):
-        placeholder.markdown(f"<p class='big-font'>Loading data... {i}</p>", unsafe_allow_html=True)
-        time.sleep(1)
-    placeholder.empty()
-
-def main():
-    st.title("Job Listings Dashboard")
-
-    display_timer()
-
-    df = load_and_concat_data()
-
-    if df.empty:
-        st.error("No data available. Please check your dataset.")
-        return
-
-    # Sidebar for navigation
-    st.sidebar.title("Navigation")
-    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
-
-    if page == "Dashboard":
-        display_dashboard(df)
-    elif page == "Data Explorer":
-        display_data_explorer(df)
-
-@st.cache_data
-def create_chart(data, x, y, title, color_sequence):
-    fig = px.bar(data, x=x, y=y, title=title, color_discrete_sequence=color_sequence)
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    return fig
-
-def display_dashboard(df):
-    col1, col2 = st.columns(2)
-
-    with col1:
-        st.subheader("Job Postings Overview")
-        st.metric("Total Job Postings", len(df))
-        st.metric("Unique Companies", df['company'].nunique())
-        st.metric("Unique Locations", df['location'].nunique())
-
-        min_date = df['date_posted'].min().date()
-        max_date = df['date_posted'].max().date()
-        st.write(f"Job postings from {min_date} to {max_date}")
-
-    with col2:
-        top_companies = df['company'].value_counts().head(10)
-        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
-    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    st.plotly_chart(fig, use_container_width=True)
-
-    col3, col4 = st.columns(2)
-
-    with col3:
-        top_locations = df['location'].value_counts().head(10)
-        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    with col4:
-        job_types = df['job_type'].value_counts()
-        fig = px.pie(names=job_types.index, values=job_types.values, title="Job Types Distribution", color_discrete_sequence=px.colors.qualitative.Pastel)
-        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-        st.plotly_chart(fig, use_container_width=True)
-
-@st.cache_data
-def filter_dataframe(df, companies, locations, job_types):
-    filtered_df = df
-    if companies:
-        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
-    if locations:
-        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
-    if job_types:
-        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
-    return filtered_df
-
-def display_data_explorer(df):
-    st.subheader("Data Explorer")
-
-    show_all = st.radio("Display", ("All Data", "Filtered Data"))
-
-    if show_all == "Filtered Data":
-        unique_values = get_unique_values(df)
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            companies = st.multiselect("Select Companies", options=unique_values['companies'])
-        with col2:
-            locations = st.multiselect("Select Locations", options=unique_values['locations'])
-        with col3:
-            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
-
-        filtered_df = filter_dataframe(df, companies, locations, job_types)
-    else:
-        filtered_df = df
-
-    st.write(f"Showing {len(filtered_df)} job listings")

-...
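In the removed code above, the body of load_and_concat_data is mostly collapsed in this view; only the tail of its columns_to_keep list and the silent except/pass are visible. A minimal sketch of such a loader, assuming the dataset repo simply accumulates per-run CSV files under data/ (the file layout and the leading column names here are assumptions, not the original code):

from huggingface_hub import HfApi, hf_hub_download
import pandas as pd

@st.cache_data
def load_and_concat_data():
    # Sketch only: list every CSV under data/ in the dataset repo,
    # read each one, and concatenate them into a single DataFrame.
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    csv_files = [
        f for f in api.list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)
        if f.startswith("data/") and f.endswith(".csv")
    ]

    frames = []
    for remote_path in csv_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id, filename=remote_path,
                repo_type="dataset", token=HF_TOKEN
            )
            frames.append(pd.read_csv(local_path))
        except Exception:
            pass  # Silently skip files that can't be processed

    if not frames:
        return pd.DataFrame()

    concatenated_df = pd.concat(frames, ignore_index=True)
    columns_to_keep = [
        'title', 'company', 'location',  # assumed leading columns
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
    return filtered_df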
import streamlit as st
+from jobspy import scrape_jobs
import pandas as pd
+from datasets import Dataset
from huggingface_hub import HfApi
+import os
+from datetime import datetime
import io
+import hashlib

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

+@st.cache_data
+def load_job_titles():
+    return [
+        "Data Analyst", "Data Scientist", "Data Engineer", "Machine Learning Engineer",
+        # ... (rest of the job titles)
+        "Data Annotation Expert", "Data Crowdsourcing Manager"
+    ]

+@st.cache_data
+def load_locations():
+    return [
+        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
+        # ... (rest of the locations)
+        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
+    ]

+def generate_random_filename():
+    current_time = datetime.now().isoformat()
+    hash_object = hashlib.md5(current_time.encode())
+    random_hash = hash_object.hexdigest()[:8]
+    return f"{random_hash}.csv"

+def update_huggingface_dataset(jobs):
+    df = pd.DataFrame(jobs)
+    filename = generate_random_filename()

+    if not os.path.exists("data"):
+        os.makedirs("data")

+    local_path = os.path.join("data", filename)
+    df.to_csv(local_path, index=False)

+    csv_string = df.to_csv(index=False)
+    file_obj = io.BytesIO(csv_string.encode())

+    api = HfApi()
+    try:
+        api.upload_file(
+            path_or_fileobj=file_obj,
+            path_in_repo=f"data/{filename}",
+            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Added new job listings"
+        )
+        return len(jobs), filename
+    except Exception as e:
+        st.error(f"Error uploading file to Hugging Face: {str(e)}")
+        return 0, None
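A hypothetical call to the helper above, not part of the diff; the sample rows and field names are made up for illustration, and the real app passes the scrape_jobs result:

# Illustration only: two made-up rows exercising the return contract
# (row count and generated filename).
sample_jobs = [
    {"title": "Data Analyst", "company": "Acme Corp", "location": "New York, NY"},
    {"title": "Data Engineer", "company": "Globex", "location": "Chicago, IL"},
]
count, name = update_huggingface_dataset(sample_jobs)
st.write(f"Uploaded {count} rows as data/{name}")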
+
+st.title("Job Scraper and Hugging Face Dataset Updater")
+
+job_titles = load_job_titles()
+locations = load_locations()
+
+search_term = st.selectbox("Job Title", options=job_titles, index=0)
+location = st.selectbox("Location", options=locations, index=0)
+
+results_wanted = st.number_input("Number of Results", min_value=1, max_value=100, value=20)
+hours_old = st.number_input("Hours Old", min_value=1, max_value=168, value=72)
+
+job_boards = st.multiselect(
+    "Select Job Boards",
+    ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
+    default=["indeed", "linkedin", "zip_recruiter", "glassdoor"]
+)
+
+if st.button("Scrape Jobs and Update Hugging Face Dataset"):
+    try:
+        with st.spinner("Scraping jobs..."):
+            jobs = scrape_jobs(
+                site_name=job_boards,
+                search_term=search_term,
+                location=location,
+                results_wanted=results_wanted,
+                hours_old=hours_old,
+                country_indeed='USA'
+            )
+
+        st.success(f"Found {len(jobs)} jobs")
+
+        df = pd.DataFrame(jobs)
+
+        st.subheader("Job Listings Preview")
+        st.dataframe(df.head())
+
+        with st.spinner("Updating Hugging Face dataset..."):
+            updated_count, filename = update_huggingface_dataset(jobs)
+
+        if updated_count > 0:
+            st.success(f"Hugging Face dataset updated successfully with {updated_count} job listings!")
+            st.info(f"New file created: {filename}")
+
+            st.markdown(f"View your dataset: https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
+        else:
+            st.error("Failed to update Hugging Face dataset. Please check your permissions and try again.")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+
+st.sidebar.header("About")
+st.sidebar.info(
+    "This app uses JobSpy to scrape job listings from various job boards "
+    "and updates a Hugging Face dataset with the results. "
+    "Enter your search criteria, select the job boards, and click 'Scrape Jobs and Update Hugging Face Dataset' to start."
+)
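The new version imports Dataset from datasets but never uses it in the code shown. If the intent is to publish through the datasets library instead of raw CSV uploads, a minimal sketch under that assumption is below; note that push_to_hub rewrites the dataset's parquet split rather than appending a new file the way upload_file does.

# Assumption: publish the scraped DataFrame via the datasets library.
# Reuses df, HF_USERNAME, DATASET_NAME and HF_TOKEN from the app above.
ds = Dataset.from_pandas(df)
ds.push_to_hub(f"{HF_USERNAME}/{DATASET_NAME}", token=HF_TOKEN)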