Niharmahesh committed
Commit d8ebc25 · verified · 1 Parent(s): 81b2b74

Update app.py

Files changed (1)
  1. app.py +103 -178
app.py CHANGED
@@ -1,195 +1,120 @@
 import streamlit as st
+from jobspy import scrape_jobs
 import pandas as pd
-import plotly.express as px
+from datasets import Dataset
 from huggingface_hub import HfApi
+import os
+from datetime import datetime
 import io
-from datetime import datetime, timedelta
-import time
-
-# Set page config for a wider layout and custom theme
-st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
-
-# Custom CSS for black background and styling
-st.markdown("""
-<style>
-.stApp {
-    background-color: #000000;
-    color: #FFFFFF;
-}
-.stButton>button {
-    background-color: #4e79a7;
-    color: white;
-}
-.stSelectbox, .stMultiSelect {
-    color: #FFFFFF;
-}
-.stDataFrame {
-    background-color: #1E1E1E;
-}
-.plotly-graph-div {
-    background-color: #1E1E1E;
-}
-.big-font {
-    font-size: 48px;
-    font-weight: bold;
-    text-align: center;
-}
-</style>
-""", unsafe_allow_html=True)
+import hashlib
 
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-@st.cache_data(ttl=3600)
-def load_and_concat_data():
-    api = HfApi()
-    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-    csv_files = [file for file in dataset_files if file.endswith('.csv')]
-
-    all_data = []
-    for file in csv_files:
-        try:
-            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content)
-            all_data.append(df)
-        except Exception:
-            pass  # Silently skip files that can't be processed
-
-    if not all_data:
-        return pd.DataFrame()
-
-    concatenated_df = pd.concat(all_data, ignore_index=True)
-
-    columns_to_keep = [
-        'site', 'job_url', 'title', 'company', 'location',
-        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
-    ]
-    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-
-    # Drop duplicates
-    filtered_df = filtered_df.drop_duplicates()
-
-    return filtered_df
-
-@st.cache_data
-def get_unique_values(df):
-    return {
-        'companies': df['company'].unique(),
-        'locations': df['location'].unique(),
-        'job_types': df['job_type'].unique()
-    }
-
-def display_timer():
-    placeholder = st.empty()
-    for i in range(15, 0, -1):
-        placeholder.markdown(f"<p class='big-font'>Loading data... {i}</p>", unsafe_allow_html=True)
-        time.sleep(1)
-    placeholder.empty()
-
-def main():
-    st.title("Job Listings Dashboard")
-
-    display_timer()
-
-    df = load_and_concat_data()
-
-    if df.empty:
-        st.error("No data available. Please check your dataset.")
-        return
-
-    # Sidebar for navigation
-    st.sidebar.title("Navigation")
-    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
-
-    if page == "Dashboard":
-        display_dashboard(df)
-    elif page == "Data Explorer":
-        display_data_explorer(df)
-
-@st.cache_data
-def create_chart(data, x, y, title, color_sequence):
-    fig = px.bar(data, x=x, y=y, title=title, color_discrete_sequence=color_sequence)
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    return fig
-
-def display_dashboard(df):
-    col1, col2 = st.columns(2)
-
-    with col1:
-        st.subheader("Job Postings Overview")
-        st.metric("Total Job Postings", len(df))
-        st.metric("Unique Companies", df['company'].nunique())
-        st.metric("Unique Locations", df['location'].nunique())
-
-        min_date = df['date_posted'].min().date()
-        max_date = df['date_posted'].max().date()
-        st.write(f"Job postings from {min_date} to {max_date}")
-
-    with col2:
-        top_companies = df['company'].value_counts().head(10)
-        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
-    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    st.plotly_chart(fig, use_container_width=True)
-
-    col3, col4 = st.columns(2)
-
-    with col3:
-        top_locations = df['location'].value_counts().head(10)
-        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    with col4:
-        job_types = df['job_type'].value_counts()
-        fig = px.pie(names=job_types.index, values=job_types.values, title="Job Types Distribution", color_discrete_sequence=px.colors.qualitative.Pastel)
-        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-        st.plotly_chart(fig, use_container_width=True)
-
-@st.cache_data
-def filter_dataframe(df, companies, locations, job_types):
-    filtered_df = df
-    if companies:
-        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
-    if locations:
-        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
-    if job_types:
-        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
-    return filtered_df
-
-def display_data_explorer(df):
-    st.subheader("Data Explorer")
-
-    show_all = st.radio("Display", ("All Data", "Filtered Data"))
-
-    if show_all == "Filtered Data":
-        unique_values = get_unique_values(df)
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            companies = st.multiselect("Select Companies", options=unique_values['companies'])
-        with col2:
-            locations = st.multiselect("Select Locations", options=unique_values['locations'])
-        with col3:
-            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
-
-        filtered_df = filter_dataframe(df, companies, locations, job_types)
-    else:
-        filtered_df = df
-
-    st.write(f"Showing {len(filtered_df)} job listings")
-
-    def make_clickable(url):
-        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'
-
-    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
-    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
-
-    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
-
-if __name__ == "__main__":
-    main()
+@st.cache_data
+def load_job_titles():
+    return [
+        "Data Analyst", "Data Scientist", "Data Engineer", "Machine Learning Engineer",
+        # ... (rest of the job titles)
+        "Data Annotation Expert", "Data Crowdsourcing Manager"
+    ]
+
+@st.cache_data
+def load_locations():
+    return [
+        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
+        # ... (rest of the locations)
+        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
+    ]
+
+def generate_random_filename():
+    current_time = datetime.now().isoformat()
+    hash_object = hashlib.md5(current_time.encode())
+    random_hash = hash_object.hexdigest()[:8]
+    return f"{random_hash}.csv"
+
+def update_huggingface_dataset(jobs):
+    df = pd.DataFrame(jobs)
+    filename = generate_random_filename()
+
+    if not os.path.exists("data"):
+        os.makedirs("data")
+
+    local_path = os.path.join("data", filename)
+    df.to_csv(local_path, index=False)
+
+    csv_string = df.to_csv(index=False)
+    file_obj = io.BytesIO(csv_string.encode())
+
+    api = HfApi()
+    try:
+        api.upload_file(
+            path_or_fileobj=file_obj,
+            path_in_repo=f"data/{filename}",
+            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Added new job listings"
+        )
+        return len(jobs), filename
+    except Exception as e:
+        st.error(f"Error uploading file to Hugging Face: {str(e)}")
+        return 0, None
+
+st.title("Job Scraper and Hugging Face Dataset Updater")
+
+job_titles = load_job_titles()
+locations = load_locations()
+
+search_term = st.selectbox("Job Title", options=job_titles, index=0)
+location = st.selectbox("Location", options=locations, index=0)
+
+results_wanted = st.number_input("Number of Results", min_value=1, max_value=100, value=20)
+hours_old = st.number_input("Hours Old", min_value=1, max_value=168, value=72)
+
+job_boards = st.multiselect(
+    "Select Job Boards",
+    ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
+    default=["indeed", "linkedin", "zip_recruiter", "glassdoor"]
+)
+
+if st.button("Scrape Jobs and Update Hugging Face Dataset"):
+    try:
+        with st.spinner("Scraping jobs..."):
+            jobs = scrape_jobs(
+                site_name=job_boards,
+                search_term=search_term,
+                location=location,
+                results_wanted=results_wanted,
+                hours_old=hours_old,
+                country_indeed='USA'
+            )
+
+        st.success(f"Found {len(jobs)} jobs")
+
+        df = pd.DataFrame(jobs)
+
+        st.subheader("Job Listings Preview")
+        st.dataframe(df.head())
+
+        with st.spinner("Updating Hugging Face dataset..."):
+            updated_count, filename = update_huggingface_dataset(jobs)
+
+        if updated_count > 0:
+            st.success(f"Hugging Face dataset updated successfully with {updated_count} job listings!")
+            st.info(f"New file created: {filename}")
+
+            st.markdown(f"View your dataset: https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
+        else:
+            st.error("Failed to update Hugging Face dataset. Please check your permissions and try again.")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+
+st.sidebar.header("About")
+st.sidebar.info(
+    "This app uses JobSpy to scrape job listings from various job boards "
+    "and updates a Hugging Face dataset with the results. "
+    "Enter your search criteria, select the job boards, and click 'Scrape Jobs and Update Hugging Face Dataset' to start."
+)
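
Since each scrape is uploaded as a separate data/<hash>.csv file, anything that consumes this dataset has to stitch the files back together. The loader in the previous revision of app.py (the removed block above) did exactly that with list_repo_files and hf_hub_download; below is a minimal standalone sketch along the same lines, where REPO_ID, the token argument, and the helper name load_all_listings are illustrative placeholders rather than values from this commit:

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "your-username/jobeasz"  # placeholder: app.py builds this as f"{HF_USERNAME}/{DATASET_NAME}"

def load_all_listings(token=None):
    """Rebuild one DataFrame from every CSV uploaded to the dataset repo."""
    api = HfApi()
    # List every file in the dataset repo, then keep only the uploaded CSVs
    files = api.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=token)
    frames = []
    for name in files:
        if not name.endswith(".csv"):
            continue
        # hf_hub_download returns a local cache path, which pd.read_csv accepts
        path = hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset", token=token)
        frames.append(pd.read_csv(path))
    if not frames:
        return pd.DataFrame()
    # Stitch the per-scrape files together and drop listings duplicated across scrapes
    return pd.concat(frames, ignore_index=True).drop_duplicates()

Dropping duplicates at read time mirrors what the removed dashboard code did and keeps repeated scrapes of the same boards from inflating counts.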