Update app.py

app.py CHANGED
@@ -66,46 +66,64 @@ def load_and_concat_data():
     for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+
+            # Use PyArrow to read CSV
+            read_options = csv.ReadOptions(column_names=[
+                'site', 'job_url', 'title', 'company', 'location',
+                'job_type', 'date_posted', 'is_remote', 'company_url'
+            ])
+            parse_options = csv.ParseOptions(delimiter=',')
+            convert_options = csv.ConvertOptions(
+                timestamp_parsers=['%Y-%m-%d']
+            )
+
+            table = csv.read_csv(file_content, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+            all_data.append(table)
+        except Exception as e:
+            print(f"Error processing file {file}: {str(e)}")
 
     if not all_data:
-        return pd.DataFrame()
+        return pa.Table.from_pandas(pd.DataFrame())
 
+    # Concatenate all tables
+    concatenated_table = pa.concat_tables(all_data)
+
+    # Filter for 2024 data
+    mask = pc.year(concatenated_table['date_posted']) == 2024
+    filtered_table = concatenated_table.filter(mask)
+
+    # Convert titles and company names to lowercase
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('title'),
+        'title',
+        pc.utf8_lower(filtered_table['title'])
+    )
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('company'),
+        'company',
+        pc.utf8_lower(filtered_table['company'])
+    )
+
+    # Clean location
     def clean_location(location):
-        # Convert to lowercase
+        if location is None:
+            return None
         location = location.lower()
+        return re.sub(r',\s*(us|usa)$', '', location)
+
+    cleaned_locations = pc.map(filtered_table['location'], clean_location)
+    filtered_table = filtered_table.set_column(
+        filtered_table.schema.get_field_index('location'),
+        'location',
+        cleaned_locations
+    )
+
+    # Remove duplicates
+    filtered_table = filtered_table.group_by(filtered_table.column_names).aggregate([])
+
+    # Convert to pandas DataFrame for compatibility with the rest of your code
+    filtered_df = filtered_table.to_pandas()
+
     return filtered_df
 
 @st.cache_data()
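One caveat with the new reader setup: when csv.ReadOptions is given explicit column_names, PyArrow treats every line of the file as data, so if the downloaded CSVs carry a header row, that row is ingested as data and the '%Y-%m-%d' timestamp parser will choke on the literal string 'date_posted'. A minimal sketch of the guard, assuming the files do have a header line (jobs_sample.csv is a hypothetical path):

import pyarrow.csv as csv

read_options = csv.ReadOptions(
    column_names=['site', 'job_url', 'title', 'company', 'location',
                  'job_type', 'date_posted', 'is_remote', 'company_url'],
    skip_rows=1,  # assumed header line; skip it so it is not parsed as a data row
)
table = csv.read_csv('jobs_sample.csv', read_options=read_options)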
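The year filter likely also needs a compute kernel. On an Arrow ChunkedArray, == is object equality (it resolves through equals() to a single Python bool rather than an element-wise comparison, at least in recent PyArrow versions), so Table.filter would reject the result of pc.year(...) == 2024. A sketch of the element-wise form:

import pyarrow as pa
import pyarrow.compute as pc

# Toy column standing in for concatenated_table['date_posted']
dates = pa.chunked_array([pa.array(['2023-12-31', '2024-01-01']).cast(pa.timestamp('s'))])
mask = pc.equal(pc.year(dates), 2024)  # element-wise kernel: [false, true]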
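pyarrow.compute does not expose a map function, so the pc.map(...) call is the one step here with no direct Arrow equivalent. The same location cleanup can be expressed with existing string kernels, which also handle the None guard because nulls pass through kernels unchanged; a sketch under that assumption, with a toy table in place of filtered_table:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({'location': ['New York, NY, US', 'Remote, USA', None]})
cleaned = pc.replace_substring_regex(
    pc.utf8_lower(table['location']),  # lowercase first, as clean_location does
    pattern=r',\s*(us|usa)$',
    replacement='',
)
table = table.set_column(table.schema.get_field_index('location'), 'location', cleaned)
# table['location'] -> ['new york, ny', 'remote', None]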
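The duplicate removal via Table.group_by(...).aggregate([]) is a valid Arrow idiom: grouping on every column with no aggregations yields one row per distinct combination, though it does not preserve row order. A small usage sketch:

import pyarrow as pa

t = pa.table({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
distinct = t.group_by(t.column_names).aggregate([])  # two rows: (1, 'x') and (2, 'y')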