"""Streamlit app that concatenates the CSV files of a Hugging Face dataset."""

import io  # NOTE(review): not referenced in the code shown here; kept as-is.

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

# Columns exposed to the UI, in display order.
COLUMNS_TO_KEEP = [
    'site', 'job_url', 'title', 'company', 'location', 'job_type',
    'date_posted', 'is_remote', 'description', 'company_url',
]


@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data() -> pd.DataFrame:
    """Download every CSV file in the dataset repo and merge them.

    Files that cannot be downloaded or parsed are skipped with a Streamlit
    warning so that one bad file does not discard the rest.

    Returns:
        A DataFrame restricted to ``COLUMNS_TO_KEEP`` with ``date_posted``
        converted to datetime (unparseable values become ``NaT``), or an
        empty DataFrame when no CSV file could be read.
    """
    api = HfApi()
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

    # Authenticate the listing call too: the download below already sends the
    # token, and without it here a private dataset fails before any download.
    dataset_files = api.list_repo_files(
        repo_id=repo_id,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    frames = []
    for name in csv_files:
        try:
            # hf_hub_download returns the local path of the downloaded file.
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=name,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            frames.append(pd.read_csv(local_path))
        except Exception as e:  # best effort: report and continue
            st.warning(f"Error reading file {name}: {e}")

    if not frames:
        st.error("No valid data found in any of the CSV files.")
        return pd.DataFrame()

    concatenated_df = pd.concat(frames, ignore_index=True)

    # Filter columns. reindex() yields an all-NaN column for any expected
    # column that no file provided, instead of raising KeyError.
    missing = [col for col in COLUMNS_TO_KEEP if col not in concatenated_df.columns]
    if missing:
        missing_names = ", ".join(missing)
        st.warning(f"Columns missing from the data and left empty: {missing_names}")
    filtered_df = concatenated_df.reindex(columns=COLUMNS_TO_KEEP).reset_index(drop=True)

    # Ensure 'date_posted' is in datetime format
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    return filtered_df


def main() -> None:
    """Render the page: load on demand, preview, statistics, and CSV download."""
    st.title("Concatenated Job Listings Data")

    if st.button("Load and Preview Concatenated Data"):
        with st.spinner("Loading and concatenating data..."):
            df = load_and_concat_data()

        if not df.empty:
            st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")

            st.subheader("Data Preview")
            st.dataframe(df.head())

            st.subheader("Dataset Statistics")
            st.write(f"Total job listings: {len(df)}")
            st.write(f"Unique companies: {df['company'].nunique()}")
            st.write(f"Unique locations: {df['location'].nunique()}")
            st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")

            # Allow user to download the concatenated dataset
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download concatenated dataset as CSV",
                data=csv,
                file_name="concatenated_job_listings.csv",
                mime="text/csv",
            )
        else:
            st.error("No data available to display.")


if __name__ == "__main__":
    main()