import streamlit as st
import pandas as pd
from huggingface_hub import HfApi

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"


@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data():
    api = HfApi()
    dataset_files = api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            # hf_hub_download returns a local file path, which pandas can read directly
            file_content = api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN
            )
            df = pd.read_csv(file_content)
            all_data.append(df)
        except Exception as e:
            st.warning(f"Error reading file {file}: {str(e)}")

    if not all_data:
        st.error("No valid data found in any of the CSV files.")
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)

    # Keep only the columns the app needs
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)

    # Ensure 'date_posted' is in datetime format
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    return filtered_df

def main():
    st.title("Concatenated Job Listings Data")

    if st.button("Load and Preview Concatenated Data"):
        with st.spinner("Loading and concatenating data..."):
            df = load_and_concat_data()

        if not df.empty:
            st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")

            st.subheader("Data Preview")
            st.dataframe(df.head())

            st.subheader("Dataset Statistics")
            st.write(f"Total job listings: {len(df)}")
            st.write(f"Unique companies: {df['company'].nunique()}")
            st.write(f"Unique locations: {df['location'].nunique()}")
            st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")

            # Allow the user to download the concatenated dataset
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download concatenated dataset as CSV",
                data=csv,
                file_name="concatenated_job_listings.csv",
                mime="text/csv",
            )
        else:
            st.error("No data available to display.")

if __name__ == "__main__":
    main()
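
For a quick sanity check outside Streamlit, the listing step can be run on its own. A minimal sketch, assuming the token is exported as an HF_TOKEN environment variable and that "your-username" stands in for the actual HF_USERNAME value:

import os

from huggingface_hub import HfApi

# Hypothetical standalone check: list the CSV files the app would concatenate.
# "your-username/jobeasz" is a placeholder for the real dataset repo id.
api = HfApi(token=os.environ["HF_TOKEN"])
files = api.list_repo_files(repo_id="your-username/jobeasz", repo_type="dataset")
print([f for f in files if f.endswith(".csv")])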