Niharmahesh committed
Commit c43821b · verified · 1 Parent(s): cecfc3c

Update app.py

Files changed (1): app.py (+14 −21)
app.py CHANGED
@@ -74,16 +74,13 @@ def load_and_concat_data():
     # Drop duplicates and rows with NaT in date_posted
     filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
 
-    # Parse locations
-    filtered_df = parse_locations(filtered_df)
-
     return filtered_df
 
 @st.cache_data()
 def get_unique_values(df):
     return {
         'companies': df['company'].unique(),
-        'locations': df['parsed_location'].unique(),
+        'locations': df['location'].unique(),
         'job_types': df['job_type'].unique()
     }
 
@@ -104,8 +101,16 @@ def create_time_series(df):
     )
     return fig
 
+def match_location(loc, valid_locations):
+    # Remove country names if present
+    loc = loc.replace(', USA', '').replace(', US', '').strip()
+    for valid_loc in valid_locations:
+        if valid_loc.lower() in loc.lower():
+            return valid_loc
+    return 'Other'
+
 @st.cache_data
-def parse_locations(df):
+def prepare_dashboard_data(df):
     valid_locations = [
         "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
         "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
@@ -128,21 +133,9 @@ def parse_locations(df):
         "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
     ]
 
-    # Handle NaN and non-string types before parsing
-    df['location'] = df['location'].fillna('').astype(str)
-
-    def parse_location(x):
-        if pd.isna(x) or not isinstance(x, str):
-            return 'Other'
-        return next((loc for loc in valid_locations if loc in x), 'Other')
-
-    df['parsed_location'] = df['location'].apply(parse_location)
-    return df
-
-@st.cache_data
-def prepare_dashboard_data(df):
+    matched_locations = df['location'].apply(lambda x: match_location(x, valid_locations))
     top_companies = df['company'].value_counts().head(10)
-    top_locations = df['parsed_location'].value_counts().head(10)
+    top_locations = matched_locations.value_counts().head(10)
     top_job_titles = df['title'].value_counts().head(20)
     df_by_date = df.groupby('date_posted').size().reset_index(name='count')
     return top_companies, top_locations, top_job_titles, df_by_date
@@ -156,7 +149,7 @@ def display_dashboard(df):
     st.subheader("Job Postings Overview")
     st.metric("Total Job Postings", len(df))
     st.metric("Unique Companies", df['company'].nunique())
-    st.metric("Unique Locations", df['parsed_location'].nunique())
+    st.metric("Unique Locations", top_locations.index.nunique())
 
     min_date = df['date_posted'].min().date()
     max_date = df['date_posted'].max().date()
@@ -186,7 +179,7 @@ def filter_dataframe(df, companies, locations, job_types):
     if companies:
         filtered_df = filtered_df[filtered_df['company'].isin(companies)]
     if locations:
-        filtered_df = filtered_df[filtered_df['parsed_location'].isin(locations)]
+        filtered_df = filtered_df[filtered_df['location'].apply(lambda x: match_location(x, locations)) != 'Other']
     if job_types:
         filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
     return filtered_df
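
For reference, a minimal, self-contained sketch of how the new match_location helper introduced in this commit behaves: the function body is copied from the diff above, while the shortened valid_locations list and the sample inputs are illustrative only and not part of the commit.

# Sketch: exercising match_location from this commit; the inputs below are hypothetical.
def match_location(loc, valid_locations):
    # Remove country names if present
    loc = loc.replace(', USA', '').replace(', US', '').strip()
    for valid_loc in valid_locations:
        if valid_loc.lower() in loc.lower():
            return valid_loc
    return 'Other'

valid_locations = ["New York, NY", "San Francisco, CA", "Boise, ID"]  # subset of the list in app.py

print(match_location("San Francisco, CA, USA", valid_locations))   # -> San Francisco, CA
print(match_location("Greater Boise, ID Area", valid_locations))    # -> Boise, ID
print(match_location("Remote", valid_locations))                    # -> Other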