import streamlit as st
import pandas as pd
from huggingface_hub import HfApi
import io

# Hugging Face setup
# The access token and account name are read from Streamlit's secrets store
# rather than being hard-coded in this file.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
# Dataset repository name; joined with HF_USERNAME as "<user>/<dataset>" to
# form the repo id used for listing and downloading files.
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data() -> pd.DataFrame:
    """Download every CSV in the Hugging Face dataset repo and merge them.

    Files whose download or parsing fails are skipped with a Streamlit
    warning so that one bad file does not block the rest.

    Returns:
        A DataFrame restricted to the job-listing columns of interest, with
        ``date_posted`` converted to datetime (unparseable values become
        ``NaT``). Columns that are absent from every source file are still
        present in the result, filled with missing values. An empty
        DataFrame is returned when no CSV file could be read.

    Raises:
        Whatever ``HfApi.list_repo_files`` raises when the repository file
        listing itself fails; that call is intentionally not guarded.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()
    # Authenticate the listing call the same way as the downloads below;
    # without the token, listing a private or gated dataset can fail even
    # though the downloads themselves would be authorised.
    dataset_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            # hf_hub_download returns the path of the locally cached file.
            local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN)
            all_data.append(pd.read_csv(local_path))
        except Exception as e:
            # Best effort: report the failing file and carry on with the others.
            st.warning(f"Error reading file {file}: {str(e)}")

    if not all_data:
        st.error("No valid data found in any of the CSV files.")
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)

    # Filter columns
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    missing_columns = [col for col in columns_to_keep if col not in concatenated_df.columns]
    if missing_columns:
        st.warning(
            "Columns missing from the source files and left empty: "
            + ", ".join(missing_columns)
        )
    # reindex (instead of plain column selection) tolerates absent columns by
    # creating them filled with NaN rather than raising KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep)

    # Ensure 'date_posted' is in datetime format
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    return filtered_df

def main():
    """Render the Streamlit page for previewing the merged job listings.

    Shows a title and a button. When the button is pressed the data is
    loaded via ``load_and_concat_data``; a non-empty result is summarised
    (preview, basic statistics, CSV download), otherwise an error is shown.
    """
    st.title("Concatenated Job Listings Data")

    # Nothing else is rendered until the user asks for the data.
    if not st.button("Load and Preview Concatenated Data"):
        return

    with st.spinner("Loading and concatenating data..."):
        jobs = load_and_concat_data()

    if jobs.empty:
        st.error("No data available to display.")
        return

    row_count = len(jobs)
    st.success(f"Successfully loaded and concatenated data. Total rows: {row_count}")

    st.subheader("Data Preview")
    st.dataframe(jobs.head())

    st.subheader("Dataset Statistics")
    posted = jobs['date_posted']
    st.write(f"Total job listings: {row_count}")
    st.write(f"Unique companies: {jobs['company'].nunique()}")
    st.write(f"Unique locations: {jobs['location'].nunique()}")
    st.write(f"Date range: {posted.min()} to {posted.max()}")

    # Offer the full merged table as a CSV download.
    csv_payload = jobs.to_csv(index=False)
    st.download_button(
        label="Download concatenated dataset as CSV",
        data=csv_payload,
        file_name="concatenated_job_listings.csv",
        mime="text/csv",
    )

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()