Niharmahesh committed
Commit ba4caa1 · verified · 1 Parent(s): a4dd7d2

Update app.py

Files changed (1)
  1. app.py +50 -14
app.py CHANGED
@@ -8,32 +8,68 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"

-def list_and_preview_csv_files():
+@st.cache_data(ttl=3600)  # Cache for 1 hour
+def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     csv_files = [file for file in dataset_files if file.endswith('.csv')]

-    st.write(f"Total CSV files found: {len(csv_files)}")
-
+    all_data = []
     for file in csv_files:
-        st.subheader(f"File: {file}")
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
             df = pd.read_csv(file_content)
-            st.write(f"Shape: {df.shape}")
-            st.write("First 5 rows:")
-            st.dataframe(df.head())
-        except pd.errors.EmptyDataError:
-            st.warning(f"File {file} is empty or contains no data.")
+            all_data.append(df)
         except Exception as e:
-            st.error(f"Error reading file {file}: {str(e)}")
-        st.write("---")
+            st.warning(f"Error reading file {file}: {str(e)}")
+
+    if not all_data:
+        st.error("No valid data found in any of the CSV files.")
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
+    # Filter columns
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+
+    # Ensure 'date_posted' is in datetime format
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    return filtered_df

 def main():
-    st.title("Hugging Face Dataset CSV Files Preview")
+    st.title("Concatenated Job Listings Data")
+
+    if st.button("Load and Preview Concatenated Data"):
+        with st.spinner("Loading and concatenating data..."):
+            df = load_and_concat_data()
+
+        if not df.empty:
+            st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")
+
+            st.subheader("Data Preview")
+            st.dataframe(df.head())
+
+            st.subheader("Dataset Statistics")
+            st.write(f"Total job listings: {len(df)}")
+            st.write(f"Unique companies: {df['company'].nunique()}")
+            st.write(f"Unique locations: {df['location'].nunique()}")
+            st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")

-    if st.button("List and Preview CSV Files"):
-        list_and_preview_csv_files()
+            # Allow user to download the concatenated dataset
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download concatenated dataset as CSV",
+                data=csv,
+                file_name="concatenated_job_listings.csv",
+                mime="text/csv",
+            )
+        else:
+            st.error("No data available to display.")

 if __name__ == "__main__":
     main()
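
The core of this change is the new data-shaping path inside load_and_concat_data(): the per-file DataFrames are concatenated, trimmed to a fixed column list, and date_posted is coerced to datetime. The sketch below is not part of the commit; it is a minimal, Streamlit-free illustration of those three steps using two fabricated mini-DataFrames (all sample values are invented; only the column list is copied from columns_to_keep in the diff).

# Standalone sketch (not from the commit): replays the concat -> column-filter ->
# datetime-coercion steps of load_and_concat_data() on made-up data, so the
# shaping logic can be checked without Streamlit, a Hub token, or any CSVs.
import pandas as pd

columns_to_keep = [
    'site', 'job_url', 'title', 'company', 'location',
    'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
]

# Two fabricated frames standing in for DataFrames read from downloaded CSVs.
df_a = pd.DataFrame({
    'site': ['indeed'], 'job_url': ['https://example.com/a'], 'title': ['Data Analyst'],
    'company': ['Acme'], 'location': ['Remote'], 'job_type': ['fulltime'],
    'date_posted': ['2024-07-01'], 'is_remote': [True],
    'description': ['...'], 'company_url': ['https://example.com'],
    'extra_column': ['dropped by the filter'],   # not in columns_to_keep
})
df_b = pd.DataFrame({
    'site': ['linkedin'], 'job_url': ['https://example.com/b'], 'title': ['ML Engineer'],
    'company': ['Globex'], 'location': ['NYC'], 'job_type': ['contract'],
    'date_posted': ['not a date'], 'is_remote': [False],
    'description': ['...'], 'company_url': ['https://example.com'],
})

all_data = [df_a, df_b]
concatenated_df = pd.concat(all_data, ignore_index=True)
filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
# errors='coerce' turns unparseable dates into NaT instead of raising.
filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

print(filtered_df.shape)           # (2, 10) -- extra_column is gone
print(filtered_df['date_posted'])  # second row becomes NaT

The Streamlit-specific pieces of the new code (st.cache_data(ttl=3600), st.warning, the download button) are deliberately omitted here, since they only affect caching and presentation, not the shaping logic.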