Niharmahesh committed on
Commit c5bab47 · verified · 1 Parent(s): ff96c77

Update app.py

Files changed (1)
  1. app.py +33 -37
app.py CHANGED
@@ -9,6 +9,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import math
 import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
 # Set page config for a wider layout and custom theme
 st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
 
@@ -53,56 +54,51 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
+
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     csv_files = [file for file in dataset_files if file.endswith('.csv')]
 
-    all_data = []
-    for file in csv_files:
+    def process_file(file):
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content, engine='pyarrow')
-            all_data.append(df)
-        except Exception:
-            pass  # Silently skip files that can't be processed
+            df = pd.read_csv(file_content, engine='pyarrow', usecols=[
+                'site', 'job_url', 'title', 'company', 'location',
+                'job_type', 'date_posted', 'is_remote', 'company_url'
+            ])
+            df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
+            df = df[df['date_posted'].dt.year == 2024].dropna(subset=['date_posted'])
+            df['title'] = df['title'].str.lower()
+            df['company'] = df['company'].str.lower()
+            df['location'] = df['location'].apply(clean_location)
+            return df
+        except Exception as e:
+            print(f"Error processing file {file}: {str(e)}")
+            return None
+
+    def clean_location(location):
+        if pd.isna(location):
+            return location
+        location = location.lower()
+        return re.sub(r',\s*(us|usa)$', '', location)
+
+    # Use ThreadPoolExecutor for parallel processing
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        future_to_file = {executor.submit(process_file, file): file for file in csv_files}
+        all_data = []
+        for future in as_completed(future_to_file):
+            df = future.result()
+            if df is not None:
+                all_data.append(df)
 
     if not all_data:
         return pd.DataFrame()
 
     concatenated_df = pd.concat(all_data, ignore_index=True)
-
-    columns_to_keep = [
-        'site', 'job_url', 'title', 'company', 'location',
-        'job_type', 'date_posted', 'is_remote', 'company_url'
-    ]
-    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-
-    # Drop duplicates and rows with NaT in date_posted
-    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
-    # filtering based on data in 2024
-    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
-    # Convert titles and company name to lowercase
-    filtered_df['title'] = filtered_df['title'].str.lower()
-    filtered_df['company'] = filtered_df['company'].str.lower()
-
-    # Function to clean the location
-    def clean_location(location):
-        if pd.isna(location):
-            return location  # Return NaN as is
-        # Convert to lowercase
-        location = location.lower()
-        # Remove ', us' or ', usa' from the end using regex
-        location = re.sub(r',\s*(us|usa)$', '', location)
-        return location
-
-    # Clean the location in place
-    filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    # added new line to drop duplicate records
-    filtered_df = filtered_df.drop_duplicates()
-
+    filtered_df = concatenated_df.drop_duplicates().reset_index(drop=True)
+
     return filtered_df
 
 @st.cache_data()
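
The heart of this commit is the switch from a sequential for-loop over the CSV files to a thread pool: each file is downloaded and cleaned in its own task, and results are collected as they finish. Below is a minimal standalone sketch of that pattern under stated assumptions; the fetch_frame helper and the file names are placeholders for illustration, not names from app.py.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import pandas as pd

    def fetch_frame(path):
        # Stand-in for the per-file work done in app.py (download + read_csv + cleaning).
        # Returning None marks the file as skipped, mirroring process_file above.
        try:
            return pd.read_csv(path)
        except Exception as exc:
            print(f"Error processing file {path}: {exc}")
            return None

    paths = ["jobs_2024_01.csv", "jobs_2024_02.csv"]  # hypothetical file list

    frames = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit one task per file, then consume results in completion order.
        futures = {executor.submit(fetch_frame, p): p for p in paths}
        for future in as_completed(futures):
            df = future.result()
            if df is not None:
                frames.append(df)

    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(len(combined))

Threads fit this workload because the per-file work is dominated by network and disk I/O rather than computation; max_workers=4 matches the value used in the commit, and errors in any single file only drop that file rather than aborting the whole load.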