Niharmahesh committed · verified
Commit b920577 · Parent(s): d8ebc25

Update app.py

Files changed (1):
  1. app.py +178 -103
app.py CHANGED
@@ -1,120 +1,195 @@
 import streamlit as st
-from jobspy import scrape_jobs
 import pandas as pd
-from datasets import Dataset
+import plotly.express as px
 from huggingface_hub import HfApi
-import os
-from datetime import datetime
 import io
-import hashlib
+from datetime import datetime, timedelta
+import time
+
+# Set page config for a wider layout and custom theme
+st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
+
+# Custom CSS for black background and styling
+st.markdown("""
+<style>
+    .stApp {
+        background-color: #000000;
+        color: #FFFFFF;
+    }
+    .stButton>button {
+        background-color: #4e79a7;
+        color: white;
+    }
+    .stSelectbox, .stMultiSelect {
+        color: #FFFFFF;
+    }
+    .stDataFrame {
+        background-color: #1E1E1E;
+    }
+    .plotly-graph-div {
+        background-color: #1E1E1E;
+    }
+    .big-font {
+        font-size: 48px;
+        font-weight: bold;
+        text-align: center;
+    }
+</style>
+""", unsafe_allow_html=True)
 
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-@st.cache_data
-def load_job_titles():
-    return [
-        "Data Analyst", "Data Scientist", "Data Engineer", "Machine Learning Engineer",
-        # ... (rest of the job titles)
-        "Data Annotation Expert", "Data Crowdsourcing Manager"
-    ]
+@st.cache_data(ttl=3600)
+def load_and_concat_data():
+    api = HfApi()
+    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
+    csv_files = [file for file in dataset_files if file.endswith('.csv')]
 
-@st.cache_data
-def load_locations():
-    return [
-        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
-        # ... (rest of the locations)
-        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
-    ]
+    all_data = []
+    for file in csv_files:
+        try:
+            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = pd.read_csv(file_content)
+            all_data.append(df)
+        except Exception:
+            pass # Silently skip files that can't be processed
 
-def generate_random_filename():
-    current_time = datetime.now().isoformat()
-    hash_object = hashlib.md5(current_time.encode())
-    random_hash = hash_object.hexdigest()[:8]
-    return f"{random_hash}.csv"
+    if not all_data:
+        return pd.DataFrame()
 
-def update_huggingface_dataset(jobs):
-    df = pd.DataFrame(jobs)
-    filename = generate_random_filename()
-
-    if not os.path.exists("data"):
-        os.makedirs("data")
+    concatenated_df = pd.concat(all_data, ignore_index=True)
 
-    local_path = os.path.join("data", filename)
-    df.to_csv(local_path, index=False)
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
 
-    csv_string = df.to_csv(index=False)
-    file_obj = io.BytesIO(csv_string.encode())
+    # Drop duplicates
+    filtered_df = filtered_df.drop_duplicates()
 
-    api = HfApi()
-    try:
-        api.upload_file(
-            path_or_fileobj=file_obj,
-            path_in_repo=f"data/{filename}",
-            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-            repo_type="dataset",
-            token=HF_TOKEN,
-            commit_message="Added new job listings"
-        )
-        return len(jobs), filename
-    except Exception as e:
-        st.error(f"Error uploading file to Hugging Face: {str(e)}")
-        return 0, None
-
-st.title("Job Scraper and Hugging Face Dataset Updater")
-
-job_titles = load_job_titles()
-locations = load_locations()
-
-search_term = st.selectbox("Job Title", options=job_titles, index=0)
-location = st.selectbox("Location", options=locations, index=0)
-
-results_wanted = st.number_input("Number of Results", min_value=1, max_value=100, value=20)
-hours_old = st.number_input("Hours Old", min_value=1, max_value=168, value=72)
-
-job_boards = st.multiselect(
-    "Select Job Boards",
-    ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
-    default=["indeed", "linkedin", "zip_recruiter", "glassdoor"]
-)
-
-if st.button("Scrape Jobs and Update Hugging Face Dataset"):
-    try:
-        with st.spinner("Scraping jobs..."):
-            jobs = scrape_jobs(
-                site_name=job_boards,
-                search_term=search_term,
-                location=location,
-                results_wanted=results_wanted,
-                hours_old=hours_old,
-                country_indeed='USA'
-            )
-
-        st.success(f"Found {len(jobs)} jobs")
-
-        df = pd.DataFrame(jobs)
-
-        st.subheader("Job Listings Preview")
-        st.dataframe(df.head())
-
-        with st.spinner("Updating Hugging Face dataset..."):
-            updated_count, filename = update_huggingface_dataset(jobs)
+    return filtered_df
+
+@st.cache_data
+def get_unique_values(df):
+    return {
+        'companies': df['company'].unique(),
+        'locations': df['location'].unique(),
+        'job_types': df['job_type'].unique()
+    }
+
+def display_timer():
+    placeholder = st.empty()
+    for i in range(15, 0, -1):
+        placeholder.markdown(f"<p class='big-font'>Loading data... {i}</p>", unsafe_allow_html=True)
+        time.sleep(1)
+    placeholder.empty()
+
+def main():
+    st.title("Job Listings Dashboard")
+
+    display_timer()
+
+    df = load_and_concat_data()
+
+    if df.empty:
+        st.error("No data available. Please check your dataset.")
+        return
+
+    # Sidebar for navigation
+    st.sidebar.title("Navigation")
+    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
+
+    if page == "Dashboard":
+        display_dashboard(df)
+    elif page == "Data Explorer":
+        display_data_explorer(df)
+
+@st.cache_data
+def create_chart(data, x, y, title, color_sequence):
+    fig = px.bar(data, x=x, y=y, title=title, color_discrete_sequence=color_sequence)
+    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+    return fig
+
+def display_dashboard(df):
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.subheader("Job Postings Overview")
+        st.metric("Total Job Postings", len(df))
+        st.metric("Unique Companies", df['company'].nunique())
+        st.metric("Unique Locations", df['location'].nunique())
+
+        min_date = df['date_posted'].min().date()
+        max_date = df['date_posted'].max().date()
+        st.write(f"Job postings from {min_date} to {max_date}")
+
+    with col2:
+        top_companies = df['company'].value_counts().head(10)
+        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
+        st.plotly_chart(fig, use_container_width=True)
+
+    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
+    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
+    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+    st.plotly_chart(fig, use_container_width=True)
+
+    col3, col4 = st.columns(2)
+
+    with col3:
+        top_locations = df['location'].value_counts().head(10)
+        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
+        st.plotly_chart(fig, use_container_width=True)
+
+    with col4:
+        job_types = df['job_type'].value_counts()
+        fig = px.pie(names=job_types.index, values=job_types.values, title="Job Types Distribution", color_discrete_sequence=px.colors.qualitative.Pastel)
+        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+        st.plotly_chart(fig, use_container_width=True)
+
+@st.cache_data
+def filter_dataframe(df, companies, locations, job_types):
+    filtered_df = df
+    if companies:
+        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
+    if locations:
+        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
+    if job_types:
+        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
+    return filtered_df
+
+def display_data_explorer(df):
+    st.subheader("Data Explorer")
+
+    show_all = st.radio("Display", ("All Data", "Filtered Data"))
+
+    if show_all == "Filtered Data":
+        unique_values = get_unique_values(df)
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            companies = st.multiselect("Select Companies", options=unique_values['companies'])
+        with col2:
+            locations = st.multiselect("Select Locations", options=unique_values['locations'])
+        with col3:
+            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
 
-        if updated_count > 0:
-            st.success(f"Hugging Face dataset updated successfully with {updated_count} job listings!")
-            st.info(f"New file created: {filename}")
-
-            st.markdown(f"View your dataset: https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
-        else:
-            st.error("Failed to update Hugging Face dataset. Please check your permissions and try again.")
-    except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
-
-st.sidebar.header("About")
-st.sidebar.info(
-    "This app uses JobSpy to scrape job listings from various job boards "
-    "and updates a Hugging Face dataset with the results. "
-    "Enter your search criteria, select the job boards, and click 'Scrape Jobs and Update Hugging Face Dataset' to start."
-)
+        filtered_df = filter_dataframe(df, companies, locations, job_types)
+    else:
+        filtered_df = df
+
+    st.write(f"Showing {len(filtered_df)} job listings")
+
+    def make_clickable(url):
+        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'
+
+    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
+    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
+
+    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
+
+if __name__ == "__main__":
+    main()
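
For reference, the read path introduced by load_and_concat_data can be exercised outside Streamlit. Below is a minimal sketch, assuming the dataset repo is Niharmahesh/jobeasz (the committer's account plus the DATASET_NAME above; adjust if HF_USERNAME differs) and that the repo is public or a read token is supplied. hf_hub_download returns a local cache path that pandas can read directly, which is the same mechanism the app's loader relies on.

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "Niharmahesh/jobeasz"  # assumed: HF_USERNAME/DATASET_NAME from the app

api = HfApi()
# List every file in the dataset repo and keep only the CSV shards the dashboard concatenates.
csv_files = [f for f in api.list_repo_files(repo_id=REPO_ID, repo_type="dataset")
             if f.endswith(".csv")]

frames = []
for name in csv_files:
    # Returns a local cache path; pass token="hf_..." here if the repo is private.
    path = hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset")
    frames.append(pd.read_csv(path))

df = pd.concat(frames, ignore_index=True).drop_duplicates()
print(f"{len(df)} rows across {len(csv_files)} CSV files")

Because the app wraps this logic in @st.cache_data(ttl=3600), the repo listing and downloads rerun at most once an hour per session; clearing the Streamlit cache forces a fresh pull of any newly uploaded shards.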