Niharmahesh committed on
Commit ad3a72f · verified · 1 Parent(s): efabaac

Update app.py

Files changed (1)
  1. app.py +29 -38
app.py CHANGED
@@ -66,28 +66,34 @@ def load_and_concat_data():
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
 
-            # Use CSV sniffer to detect delimiter and number of columns
-            with open(file_content, 'r') as f:
-                sample = f.read(1024)
-                sniffer = csv.Sniffer()
-                dialect = sniffer.sniff(sample)
-                f.seek(0)
-
-            # Use pyarrow for more flexible parsing
-            parse_options = csv.ParseOptions(delimiter=dialect.delimiter)
-            table = csv.read_csv(file_content, parse_options=parse_options)
-
-            # Convert to pandas DataFrame
-            df = table.to_pandas()
+            # Use PyArrow's CSV reading capabilities
+            read_options = csv.ReadOptions(use_threads=True)
+            parse_options = csv.ParseOptions(delimiter=',')  # Adjust delimiter if needed
+            convert_options = csv.ConvertOptions(
+                column_types={
+                    'date_posted': pa.timestamp('s'),
+                    'is_remote': pa.bool_()
+                },
+                strings_can_be_null=True
+            )
 
-            # Ensure all required columns are present, fill with NaN if missing
-            required_columns = ['site', 'job_url', 'title', 'company', 'location', 'job_type', 'date_posted', 'is_remote', 'company_url']
-            for col in required_columns:
-                if col not in df.columns:
-                    df[col] = pd.NA
+            table = csv.read_csv(file_content, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+            df = table.to_pandas()
 
-            # Select only the required columns
-            df = df[required_columns]
+            # Perform data cleaning and processing
+            df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
+            df = df.dropna(subset=['date_posted'])
+            df = df[df['date_posted'].dt.year == 2024]
+            df['title'] = df['title'].str.lower()
+            df['company'] = df['company'].str.lower()
+
+            def clean_location(location):
+                if pd.isna(location):
+                    return location
+                location = str(location).lower()
+                return re.sub(r',\s*(us|usa)$', '', location)
+
+            df['location'] = df['location'].apply(clean_location)
 
             all_data.append(df)
         except Exception as e:
@@ -98,24 +104,9 @@ def load_and_concat_data():
         return pd.DataFrame()
 
     concatenated_df = pd.concat(all_data, ignore_index=True)
-
-    # Perform data cleaning and processing
-    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'], errors='coerce')
-    concatenated_df = concatenated_df.dropna(subset=['date_posted'])
-    concatenated_df = concatenated_df[concatenated_df['date_posted'].dt.year == 2024]
-    concatenated_df['title'] = concatenated_df['title'].str.lower()
-    concatenated_df['company'] = concatenated_df['company'].str.lower()
-
-    def clean_location(location):
-        if pd.isna(location):
-            return location
-        location = str(location).lower()
-        return re.sub(r',\s*(us|usa)$', '', location)
-
-    concatenated_df['location'] = concatenated_df['location'].apply(clean_location)
-    concatenated_df = concatenated_df.drop_duplicates()
-
-    return concatenated_df
+    filtered_df = concatenated_df.drop_duplicates().reset_index(drop=True)
+
+    return filtered_df
 
 @st.cache_data()
 def get_unique_values(df):
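
For reference, below is a minimal standalone sketch of the read-and-clean path this commit switches to. It reuses the ReadOptions/ParseOptions/ConvertOptions configuration and the per-file cleaning steps from the diff; the explicit imports, the two-row in-memory CSV sample, and the final print are illustrative assumptions, not part of app.py.

import io
import re

import pandas as pd
import pyarrow as pa
from pyarrow import csv

# Hypothetical in-memory CSV standing in for a downloaded dataset file.
sample = io.BytesIO(
    b'title,company,location,date_posted,is_remote\n'
    b'Data Analyst,Acme,"San Jose, CA, USA",2024-03-01,true\n'
    b'ML Engineer,Initech,"Austin, TX, US",2023-12-30,false\n'
)

# Same reader configuration as the commit: threaded reads, explicit comma
# delimiter, and typed conversion for the timestamp and boolean columns.
read_options = csv.ReadOptions(use_threads=True)
parse_options = csv.ParseOptions(delimiter=',')
convert_options = csv.ConvertOptions(
    column_types={'date_posted': pa.timestamp('s'), 'is_remote': pa.bool_()},
    strings_can_be_null=True,
)

table = csv.read_csv(sample, read_options=read_options,
                     parse_options=parse_options,
                     convert_options=convert_options)
df = table.to_pandas()

# The same per-file cleaning as the commit: coerce dates, keep 2024 rows,
# lowercase text columns, strip a trailing ", us"/", usa" from locations.
df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
df = df.dropna(subset=['date_posted'])
df = df[df['date_posted'].dt.year == 2024]
df['title'] = df['title'].str.lower()
df['company'] = df['company'].str.lower()

def clean_location(location):
    if pd.isna(location):
        return location
    location = str(location).lower()
    return re.sub(r',\s*(us|usa)$', '', location)

df['location'] = df['location'].apply(clean_location)
print(df)  # one 2024 row; location becomes 'san jose, ca'

Declaring date_posted and is_remote in ConvertOptions pushes the type conversion into Arrow's CSV reader itself, so the pd.to_datetime(..., errors='coerce') that follows acts mainly as a guard for rows the reader left null.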