Niharmahesh committed
Commit 64d6b5e · verified · 1 Parent(s): b5e737c

Update app.py

Files changed (1)
  1. app.py +24 -18
app.py CHANGED
@@ -64,18 +64,33 @@ def load_and_concat_data():
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
-    all_data = []
-    for file in feather_files:
+    # Function to download and load a single file
+    def download_and_load(file):
         try:
-            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = feather.read_feather(file_content)
-            all_data.append(df)
-        except Exception:
-            pass # Silently skip files that can't be processed
+            file_content = api.hf_hub_download(
+                repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+                filename=file,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            return feather.read_feather(file_content)
+        except Exception as e:
+            print(f"Error loading {file}: {str(e)}")
+            return None
+
+    # Download files in parallel
+    all_data = []
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        future_to_file = {executor.submit(download_and_load, file): file for file in feather_files}
+        for future in as_completed(future_to_file):
+            df = future.result()
+            if df is not None:
+                all_data.append(df)
 
     if not all_data:
         return pd.DataFrame()
 
+    # Rest of your processing logic remains the same
     concatenated_df = pd.concat(all_data, ignore_index=True)
 
     columns_to_keep = [
@@ -85,31 +100,22 @@ def load_and_concat_data():
     filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
     filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
 
-    # Drop duplicates and rows with NaT in date_posted removed this to make it clear (jan13th)
-    #filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
-    #filtering based on data in 2024
     filtered_df = filtered_df[filtered_df['date_posted'].dt.year==2025]
-    # Convert titles and company name to lowercase
     filtered_df['title'] = filtered_df['title'].str.lower()
     filtered_df['company'] = filtered_df['company'].str.lower()
 
-    # Function to clean the location
     def clean_location(location):
         if pd.isna(location):
-            return location # Return NaN as is
-        # Convert to lowercase
+            return location
         location = location.lower()
-        # Remove ', us' or ', usa' from the end using regex
         location = re.sub(r',\s*(us|usa)$', '', location)
         return location
 
-    # Clean the location in place
    filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    #added new line to drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
 
     return filtered_df
-
+
 @st.cache_data()
 def get_unique_values(df):
     return {
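
Note on the change: the new hunk fans the per-file Feather downloads out over a thread pool instead of looping over them sequentially. It relies on ThreadPoolExecutor and as_completed from the standard library's concurrent.futures module; that import is not visible in this hunk and is assumed to exist elsewhere in app.py. A minimal, self-contained sketch of the same pattern (the file names and the load_one helper below are placeholders, not code from this repo):

# Sketch of the parallel-load pattern adopted above; file names and the
# load_one helper are illustrative placeholders.
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd

FEATHER_FILES = ["jobs_2025_01.feather", "jobs_2025_02.feather"]  # placeholder paths

def load_one(path):
    """Return a DataFrame for one file, or None if it cannot be read."""
    try:
        return pd.read_feather(path)  # stands in for hf_hub_download + read_feather
    except Exception as exc:
        print(f"Error loading {path}: {exc}")
        return None

all_data = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(load_one, path): path for path in FEATHER_FILES}
    for future in as_completed(futures):
        df = future.result()
        if df is not None:
            all_data.append(df)

combined = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

Threads fit this workload because each task is dominated by network and disk I/O rather than computation, so the GIL is not the bottleneck; max_workers=10 simply caps the number of concurrent Hub requests.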
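
For reference, the regex in clean_location only strips a trailing ", us" or ", usa" after lowercasing; a quick illustration with a made-up value (not from the dataset):

import re

location = "New York, NY, USA".lower()             # -> "new york, ny, usa"
location = re.sub(r',\s*(us|usa)$', '', location)  # -> "new york, ny"
print(location)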