job_easz / app.py
Niharmahesh's picture
Update app.py
ba4caa1 verified
raw
history blame
2.66 kB
import streamlit as st
import pandas as pd
from huggingface_hub import HfApi
import io
# --- Hugging Face configuration ---
# Name of the dataset repository whose CSV files are loaded by this app.
DATASET_NAME = "jobeasz"
# The access token and the account name that owns the dataset come from
# Streamlit's secrets store, so neither value is hard-coded in this file.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data():
    """Download every CSV file of the Hugging Face dataset and merge them.

    Lists the files of the ``{HF_USERNAME}/{DATASET_NAME}`` dataset
    repository, downloads each ``.csv`` file, reads it with pandas and
    concatenates the results. A file that cannot be downloaded or parsed is
    reported with ``st.warning`` and skipped, so one bad file does not abort
    the whole load.

    Returns:
        A DataFrame containing only the job-listing columns listed in
        ``columns_to_keep``, with ``date_posted`` converted to datetime
        (values that cannot be parsed become ``NaT``). Expected columns that
        are absent from the data are added as empty columns and reported
        with ``st.warning``. If no file could be read, ``st.error`` is shown
        and an empty DataFrame is returned.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    api = HfApi()

    # Use the same token for listing as for downloading; without it the
    # listing call fails on private or gated dataset repositories.
    dataset_files = api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [name for name in dataset_files if name.endswith('.csv')]

    all_data = []
    for file in csv_files:
        # Best effort per file: report the failure and continue with the rest.
        try:
            # hf_hub_download returns the local path of the downloaded file.
            local_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=file,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            all_data.append(pd.read_csv(local_path))
        except Exception as e:
            st.warning(f"Error reading file {file}: {e}")

    if not all_data:
        st.error("No valid data found in any of the CSV files.")
        return pd.DataFrame()

    # ignore_index=True already yields a clean 0..n-1 index.
    concatenated_df = pd.concat(all_data, ignore_index=True)

    # Filter columns
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url',
    ]
    missing_columns = [
        column for column in columns_to_keep
        if column not in concatenated_df.columns
    ]
    if missing_columns:
        st.warning(
            "Columns missing from the dataset and left empty: "
            + ", ".join(missing_columns)
        )
    # reindex (instead of df[columns]) keeps the expected schema even when a
    # column is absent, rather than raising KeyError.
    filtered_df = concatenated_df.reindex(columns=columns_to_keep)

    # Ensure 'date_posted' is in datetime format
    filtered_df['date_posted'] = pd.to_datetime(
        filtered_df['date_posted'], errors='coerce'
    )
    return filtered_df
def main():
    """Render the page: load button, data preview, statistics and download.

    Once the load button has been clicked, the concatenated data is shown
    with a five-row preview, summary statistics and a CSV download button.
    If the loaded DataFrame is empty, an error message is shown instead.
    """
    st.title("Concatenated Job Listings Data")

    # st.button returns True only during the rerun triggered by its own
    # click. Clicking the download button below triggers another rerun, which
    # would hide the results again, so the choice is kept in session state.
    if st.button("Load and Preview Concatenated Data"):
        st.session_state["show_concatenated_data"] = True
    if not st.session_state.get("show_concatenated_data", False):
        return

    with st.spinner("Loading and concatenating data..."):
        df = load_and_concat_data()

    if df.empty:
        st.error("No data available to display.")
        return

    st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")

    st.subheader("Data Preview")
    st.dataframe(df.head())

    st.subheader("Dataset Statistics")
    st.write(f"Total job listings: {len(df)}")
    st.write(f"Unique companies: {df['company'].nunique()}")
    st.write(f"Unique locations: {df['location'].nunique()}")
    st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")

    # Allow user to download the concatenated dataset
    csv = df.to_csv(index=False)
    st.download_button(
        label="Download concatenated dataset as CSV",
        data=csv,
        file_name="concatenated_job_listings.csv",
        mime="text/csv",
    )
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()