Update app.py
app.py
CHANGED
@@ -8,32 +8,68 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-
+@st.cache_data(ttl=3600)  # Cache for 1 hour
+def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     csv_files = [file for file in dataset_files if file.endswith('.csv')]
 
-
-
+    all_data = []
     for file in csv_files:
-        st.subheader(f"File: {file}")
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
             df = pd.read_csv(file_content)
-
-            st.write("First 5 rows:")
-            st.dataframe(df.head())
-        except pd.errors.EmptyDataError:
-            st.warning(f"File {file} is empty or contains no data.")
+            all_data.append(df)
         except Exception as e:
-            st.
-
+            st.warning(f"Error reading file {file}: {str(e)}")
+
+    if not all_data:
+        st.error("No valid data found in any of the CSV files.")
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
+    # Filter columns
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+
+    # Ensure 'date_posted' is in datetime format
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    return filtered_df
 
 def main():
-    st.title("
+    st.title("Concatenated Job Listings Data")
+
+    if st.button("Load and Preview Concatenated Data"):
+        with st.spinner("Loading and concatenating data..."):
+            df = load_and_concat_data()
+
+        if not df.empty:
+            st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")
+
+            st.subheader("Data Preview")
+            st.dataframe(df.head())
+
+            st.subheader("Dataset Statistics")
+            st.write(f"Total job listings: {len(df)}")
+            st.write(f"Unique companies: {df['company'].nunique()}")
+            st.write(f"Unique locations: {df['location'].nunique()}")
+            st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")
 
-
-
+            # Allow user to download the concatenated dataset
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download concatenated dataset as CSV",
+                data=csv,
+                file_name="concatenated_job_listings.csv",
+                mime="text/csv",
+            )
+        else:
+            st.error("No data available to display.")
 
 if __name__ == "__main__":
     main()
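A note for reviewers on the new @st.cache_data(ttl=3600) decorator: within the one-hour TTL, Streamlit reruns return the memoized DataFrame instead of re-running the Hub download loop. Below is a minimal sketch of that behavior, runnable as its own Streamlit app; the slow_load function and its two-second sleep are hypothetical stand-ins for the download-and-concat work, not part of this commit.

import time

import pandas as pd
import streamlit as st

@st.cache_data(ttl=3600)  # same pattern as load_and_concat_data()
def slow_load(n: int) -> pd.DataFrame:
    time.sleep(2)  # hypothetical stand-in for the CSV download loop
    return pd.DataFrame({"x": range(n)})

start = time.time()
slow_load(10)        # first call executes the body (about 2 s)
df = slow_load(10)   # second call is served from the cache
st.write(f"Two calls took {time.time() - start:.1f}s for {len(df)} rows")

One consequence worth noting: st.cache_data keys entries on the function's arguments, and load_and_concat_data() takes none, so CSVs added to the dataset after the first load will not appear until the TTL expires or the cache is cleared.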
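Similarly, the errors='coerce' argument in the new pd.to_datetime call means unparseable date_posted values become NaT instead of raising, and pandas skips NaT when computing the min/max shown under "Dataset Statistics". A small standalone sketch with made-up values:

import pandas as pd

dates = pd.Series(["2024-05-01", "not a date", "2024-06-15"])
parsed = pd.to_datetime(dates, errors="coerce")
print(parsed.isna().sum())           # 1 -- the malformed value became NaT
print(parsed.min(), parsed.max())    # NaT is skipped: 2024-05-01 ... 2024-06-15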