Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -74,16 +74,13 @@ def load_and_concat_data():
|
|
74 |
# Drop duplicates and rows with NaT in date_posted
|
75 |
filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
|
76 |
|
77 |
-
# Parse locations
|
78 |
-
filtered_df = parse_locations(filtered_df)
|
79 |
-
|
80 |
return filtered_df
|
81 |
|
82 |
@st.cache_data()
|
83 |
def get_unique_values(df):
|
84 |
return {
|
85 |
'companies': df['company'].unique(),
|
86 |
-
'locations': df['parsed_location'].unique(),
|
87 |
'job_types': df['job_type'].unique()
|
88 |
}
|
89 |
|
@@ -104,8 +101,16 @@ def create_time_series(df):
|
|
104 |
)
|
105 |
return fig
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
@st.cache_data
|
108 |
-
def parse_locations(df):
|
109 |
valid_locations = [
|
110 |
"New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
|
111 |
"Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
|
@@ -128,21 +133,9 @@ def parse_locations(df):
|
|
128 |
"San Bernardino, CA", "Boise, ID", "Birmingham, AL"
|
129 |
]
|
130 |
|
131 |
-
|
132 |
-
df['location'] = df['location'].fillna('').astype(str)
|
133 |
-
|
134 |
-
def parse_location(x):
|
135 |
-
if pd.isna(x) or not isinstance(x, str):
|
136 |
-
return 'Other'
|
137 |
-
return next((loc for loc in valid_locations if loc in x), 'Other')
|
138 |
-
|
139 |
-
df['parsed_location'] = df['location'].apply(parse_location)
|
140 |
-
return df
|
141 |
-
|
142 |
-
@st.cache_data
|
143 |
-
def prepare_dashboard_data(df):
|
144 |
top_companies = df['company'].value_counts().head(10)
|
145 |
-
top_locations = df['parsed_location'].value_counts().head(10)
|
146 |
top_job_titles = df['title'].value_counts().head(20)
|
147 |
df_by_date = df.groupby('date_posted').size().reset_index(name='count')
|
148 |
return top_companies, top_locations, top_job_titles, df_by_date
|
@@ -156,7 +149,7 @@ def display_dashboard(df):
|
|
156 |
st.subheader("Job Postings Overview")
|
157 |
st.metric("Total Job Postings", len(df))
|
158 |
st.metric("Unique Companies", df['company'].nunique())
|
159 |
-
st.metric("Unique Locations", df['parsed_location'].nunique())
|
160 |
|
161 |
min_date = df['date_posted'].min().date()
|
162 |
max_date = df['date_posted'].max().date()
|
@@ -186,7 +179,7 @@ def filter_dataframe(df, companies, locations, job_types):
|
|
186 |
if companies:
|
187 |
filtered_df = filtered_df[filtered_df['company'].isin(companies)]
|
188 |
if locations:
|
189 |
-
filtered_df = filtered_df[filtered_df['parsed_location'].isin(locations)]
|
190 |
if job_types:
|
191 |
filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
|
192 |
return filtered_df
|
|
|
74 |
# Drop duplicates and rows with NaT in date_posted
|
75 |
filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
|
76 |
|
|
|
|
|
|
|
77 |
return filtered_df
|
78 |
|
79 |
@st.cache_data()
def get_unique_values(df):
    """Collect the distinct values of each filterable column of *df*.

    Returns a dict with keys 'companies', 'locations', 'job_types'
    mapping to the unique values of the corresponding DataFrame column.
    Cached by Streamlit so repeated reruns skip the recomputation.
    """
    # Filter-key -> source-column mapping keeps the result keys stable
    # while making it obvious which column feeds each widget.
    filter_columns = {
        'companies': 'company',
        'locations': 'location',
        'job_types': 'job_type',
    }
    return {key: df[col].unique() for key, col in filter_columns.items()}
|
86 |
|
|
|
101 |
)
|
102 |
return fig
|
103 |
|
104 |
+
def match_location(loc, valid_locations):
    """Map a raw location string to one of the known canonical cities.

    Strips a trailing country suffix (", USA" / ", US"), then returns the
    first entry of *valid_locations* that occurs (case-insensitively)
    inside *loc*, or 'Other' when nothing matches.

    Args:
        loc: Raw location value from the data; non-string values
            (e.g. NaN floats from a pandas column) map to 'Other'.
        valid_locations: Iterable of canonical "City, ST" strings.

    Returns:
        The matched canonical location string, or 'Other'.
    """
    # Guard against NaN/None coming straight out of df['location'] —
    # without this, .replace on a float raises AttributeError (the old
    # parse_location helper had this check; it was lost in the rewrite).
    if not isinstance(loc, str):
        return 'Other'
    # Remove country names if present
    loc = loc.replace(', USA', '').replace(', US', '').strip()
    # Lowercase once instead of on every loop iteration.
    haystack = loc.lower()
    for valid_loc in valid_locations:
        if valid_loc.lower() in haystack:
            return valid_loc
    return 'Other'
|
111 |
+
|
112 |
@st.cache_data
|
113 |
+
def prepare_dashboard_data(df):
|
114 |
valid_locations = [
|
115 |
"New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
|
116 |
"Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
|
|
|
133 |
"San Bernardino, CA", "Boise, ID", "Birmingham, AL"
|
134 |
]
|
135 |
|
136 |
+
matched_locations = df['location'].apply(lambda x: match_location(x, valid_locations))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
top_companies = df['company'].value_counts().head(10)
|
138 |
+
top_locations = matched_locations.value_counts().head(10)
|
139 |
top_job_titles = df['title'].value_counts().head(20)
|
140 |
df_by_date = df.groupby('date_posted').size().reset_index(name='count')
|
141 |
return top_companies, top_locations, top_job_titles, df_by_date
|
|
|
149 |
st.subheader("Job Postings Overview")
|
150 |
st.metric("Total Job Postings", len(df))
|
151 |
st.metric("Unique Companies", df['company'].nunique())
|
152 |
+
st.metric("Unique Locations", top_locations.index.nunique())
|
153 |
|
154 |
min_date = df['date_posted'].min().date()
|
155 |
max_date = df['date_posted'].max().date()
|
|
|
179 |
if companies:
|
180 |
filtered_df = filtered_df[filtered_df['company'].isin(companies)]
|
181 |
if locations:
|
182 |
+
filtered_df = filtered_df[filtered_df['location'].apply(lambda x: match_location(x, locations)) != 'Other']
|
183 |
if job_types:
|
184 |
filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
|
185 |
return filtered_df
|