Niharmahesh committed
Commit c43821b · verified · 1 Parent(s): cecfc3c

Update app.py

Files changed (1): app.py (+14 −21)
app.py CHANGED
@@ -74,16 +74,13 @@ def load_and_concat_data():
     # Drop duplicates and rows with NaT in date_posted
     filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
 
-    # Parse locations
-    filtered_df = parse_locations(filtered_df)
-
     return filtered_df
 
 @st.cache_data()
 def get_unique_values(df):
     return {
         'companies': df['company'].unique(),
-        'locations': df['parsed_location'].unique(),
+        'locations': df['location'].unique(),
         'job_types': df['job_type'].unique()
     }
 
@@ -104,8 +101,16 @@ def create_time_series(df):
     )
     return fig
 
+def match_location(loc, valid_locations):
+    # Remove country names if present
+    loc = loc.replace(', USA', '').replace(', US', '').strip()
+    for valid_loc in valid_locations:
+        if valid_loc.lower() in loc.lower():
+            return valid_loc
+    return 'Other'
+
 @st.cache_data
-def parse_locations(df):
+def prepare_dashboard_data(df):
     valid_locations = [
         "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
         "Phoenix, AZ", "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX",
@@ -128,21 +133,9 @@ def parse_locations(df):
         "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
     ]
 
-    # Handle NaN and non-string types before parsing
-    df['location'] = df['location'].fillna('').astype(str)
-
-    def parse_location(x):
-        if pd.isna(x) or not isinstance(x, str):
-            return 'Other'
-        return next((loc for loc in valid_locations if loc in x), 'Other')
-
-    df['parsed_location'] = df['location'].apply(parse_location)
-    return df
-
-@st.cache_data
-def prepare_dashboard_data(df):
+    matched_locations = df['location'].apply(lambda x: match_location(x, valid_locations))
     top_companies = df['company'].value_counts().head(10)
-    top_locations = df['parsed_location'].value_counts().head(10)
+    top_locations = matched_locations.value_counts().head(10)
     top_job_titles = df['title'].value_counts().head(20)
     df_by_date = df.groupby('date_posted').size().reset_index(name='count')
     return top_companies, top_locations, top_job_titles, df_by_date
@@ -156,7 +149,7 @@ def display_dashboard(df):
     st.subheader("Job Postings Overview")
     st.metric("Total Job Postings", len(df))
     st.metric("Unique Companies", df['company'].nunique())
-    st.metric("Unique Locations", df['parsed_location'].nunique())
+    st.metric("Unique Locations", top_locations.index.nunique())
 
     min_date = df['date_posted'].min().date()
     max_date = df['date_posted'].max().date()
@@ -186,7 +179,7 @@ def filter_dataframe(df, companies, locations, job_types):
     if companies:
         filtered_df = filtered_df[filtered_df['company'].isin(companies)]
     if locations:
-        filtered_df = filtered_df[filtered_df['parsed_location'].isin(locations)]
+        filtered_df = filtered_df[filtered_df['location'].apply(lambda x: match_location(x, locations)) != 'Other']
     if job_types:
         filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
     return filtered_df
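
For reference, a minimal, self-contained sketch of how the new match_location helper introduced in this commit behaves: the function body is copied from the diff above, while the shortened valid_locations list and the sample inputs are illustrative only and not part of the commit.

# Sketch: exercising match_location from this commit; the inputs below are hypothetical.
def match_location(loc, valid_locations):
    # Remove country names if present
    loc = loc.replace(', USA', '').replace(', US', '').strip()
    for valid_loc in valid_locations:
        if valid_loc.lower() in loc.lower():
            return valid_loc
    return 'Other'

valid_locations = ["New York, NY", "San Francisco, CA", "Boise, ID"]  # subset of the list in app.py

print(match_location("San Francisco, CA, USA", valid_locations))   # -> San Francisco, CA
print(match_location("Greater Boise, ID Area", valid_locations))    # -> Boise, ID
print(match_location("Remote", valid_locations))                    # -> Other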