Niharmahesh committed (verified) · Commit cf1bd9a · Parent(s): 838169f

Update app.py

Files changed (1): app.py (+53 −35)
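The new code path refers to PyArrow as `pa`, its compute kernels as `pc`, and its CSV module as `csv`. The hunk below starts at line 66, so the import block is out of frame; a plausible set of imports for the new code, stated as an assumption rather than read from the commit:

    import re

    import pandas as pd
    import pyarrow as pa
    import pyarrow.compute as pc
    from pyarrow import csv  # PyArrow's CSV reader, used below as csv.read_csv

Note that `from pyarrow import csv` shadows the standard-library csv module, which this code does not use.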
app.py CHANGED
@@ -66,46 +66,64 @@ def load_and_concat_data():
     for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content, engine='pyarrow')
-            all_data.append(df)
-        except Exception:
-            pass  # Silently skip files that can't be processed
+
+            # Use PyArrow to read the CSV
+            read_options = csv.ReadOptions(column_names=[
+                'site', 'job_url', 'title', 'company', 'location',
+                'job_type', 'date_posted', 'is_remote', 'company_url'
+            ], skip_rows=1)  # skip the file's own header row, since column_names is supplied explicitly
+            parse_options = csv.ParseOptions(delimiter=',')
+            convert_options = csv.ConvertOptions(
+                timestamp_parsers=['%Y-%m-%d']
+            )
+
+            table = csv.read_csv(file_content, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+            all_data.append(table)
+        except Exception as e:
+            print(f"Error processing file {file}: {str(e)}")
 
     if not all_data:
         return pd.DataFrame()  # keep the return type consistent; the non-empty path also ends in a DataFrame
 
-    concatenated_df = pd.concat(all_data, ignore_index=True)
-
-    columns_to_keep = [
-        'site', 'job_url', 'title', 'company', 'location',
-        'job_type', 'date_posted', 'is_remote', 'company_url'
-    ]
-    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-
-    # Drop duplicates and rows with NaT in date_posted
-    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
-    # Keep only postings from 2024
-    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
-    # Convert titles and company names to lowercase
-    filtered_df['title'] = filtered_df['title'].str.lower()
-    filtered_df['company'] = filtered_df['company'].str.lower()
-
-    # Function to clean the location
+    # Concatenate all tables
+    concatenated_table = pa.concat_tables(all_data)
+
+    # Filter for 2024 data (pc.equal builds the element-wise boolean mask)
+    mask = pc.equal(pc.year(concatenated_table['date_posted']), 2024)
+    filtered_table = concatenated_table.filter(mask)
+
+    # Convert titles and company names to lowercase
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('title'),
+        'title',
+        pc.utf8_lower(filtered_table['title'])
+    )
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('company'),
+        'company',
+        pc.utf8_lower(filtered_table['company'])
+    )
+
+    # Clean location
     def clean_location(location):
-        if pd.isna(location):
-            return location  # Return NaN as is
-        # Convert to lowercase
+        if location is None:
+            return None
         location = location.lower()
-        # Remove ', us' or ', usa' from the end using regex
-        location = re.sub(r',\s*(us|usa)$', '', location)
-        return location
-
-    # Clean the location in place
-    filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    # Drop duplicate records once more after cleaning
-    filtered_df = filtered_df.drop_duplicates()
-
+        return re.sub(r',\s*(us|usa)$', '', location)
+
+    # pyarrow.compute has no element-wise Python map, so clean in Python and rebuild the column
+    cleaned_locations = pa.array(
+        [clean_location(loc) for loc in filtered_table['location'].to_pylist()]
+    )
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('location'),
+        'location',
+        cleaned_locations
+    )
+
+    # Remove duplicates: grouping by every column with no aggregations keeps one row per distinct combination
+    filtered_table = filtered_table.group_by(filtered_table.column_names).aggregate([])
+
+    # Convert to a pandas DataFrame for compatibility with the rest of the app
+    filtered_df = filtered_table.to_pandas()
+
     return filtered_df
 
 @st.cache_data()
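For readers who want to try the new pipeline outside the app, here is a minimal, self-contained sketch of the same steps run against a tiny in-memory CSV. The sample data, the io.BytesIO stand-in for the downloaded file, and the skip_rows=1 header handling are illustrative assumptions, not part of the commit:

    import io
    import re

    import pyarrow as pa
    import pyarrow.compute as pc
    from pyarrow import csv

    # Hypothetical stand-in for one downloaded CSV file (the real code
    # passes the path returned by hf_hub_download).
    raw = io.BytesIO(
        b'site,job_url,title,company,location,job_type,date_posted,is_remote,company_url\n'
        b'indeed,https://example.com/1,Data Engineer,Acme,"Boston, MA, USA",fulltime,2024-03-01,true,https://acme.example\n'
        b'indeed,https://example.com/2,ML Engineer,Acme,"Austin, TX, US",fulltime,2023-11-20,false,https://acme.example\n'
    )

    table = csv.read_csv(
        raw,
        read_options=csv.ReadOptions(
            column_names=['site', 'job_url', 'title', 'company', 'location',
                          'job_type', 'date_posted', 'is_remote', 'company_url'],
            skip_rows=1,  # skip the file's own header row
        ),
        convert_options=csv.ConvertOptions(timestamp_parsers=['%Y-%m-%d']),
    )

    # Keep only 2024 postings; pc.equal builds the boolean mask explicitly.
    table = table.filter(pc.equal(pc.year(table['date_posted']), 2024))

    # Vectorised lowercasing of title and company.
    for name in ('title', 'company'):
        table = table.set_column(
            table.schema.get_field_index(name), name, pc.utf8_lower(table[name]))

    # Location cleanup round-trips through Python, since pyarrow.compute
    # has no per-element Python-callable map.
    def clean_location(location):
        if location is None:
            return None
        return re.sub(r',\s*(us|usa)$', '', location.lower())

    table = table.set_column(
        table.schema.get_field_index('location'),
        'location',
        pa.array([clean_location(v) for v in table['location'].to_pylist()]))

    # Grouping by every column with no aggregations is PyArrow's idiom for
    # drop_duplicates(): one row survives per distinct key combination.
    table = table.group_by(table.column_names).aggregate([])

    print(table.to_pandas())  # one row: the 2024 posting, location 'boston, ma'

One caveat on the dedupe idiom: unlike pandas' drop_duplicates, which keeps the first occurrence in input order, the group_by result may not preserve row order.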