import streamlit as st
import pandas as pd
from huggingface_hub import HfApi

# Hugging Face setup
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"


@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_and_concat_data():
    api = HfApi()
    dataset_files = api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset", token=HF_TOKEN
    )
    csv_files = [file for file in dataset_files if file.endswith('.csv')]

    all_data = []
    for file in csv_files:
        try:
            # hf_hub_download returns a local file path, which pandas can read directly
            file_content = api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN
            )
            df = pd.read_csv(file_content)
            all_data.append(df)
        except Exception as e:
            st.warning(f"Error reading file {file}: {str(e)}")

    if not all_data:
        st.error("No valid data found in any of the CSV files.")
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)

    # Keep only the columns the app needs
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)

    # Ensure 'date_posted' is in datetime format
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    return filtered_df

def main():
    st.title("Concatenated Job Listings Data")

    if st.button("Load and Preview Concatenated Data"):
        with st.spinner("Loading and concatenating data..."):
            df = load_and_concat_data()

        if not df.empty:
            st.success(f"Successfully loaded and concatenated data. Total rows: {len(df)}")

            st.subheader("Data Preview")
            st.dataframe(df.head())

            st.subheader("Dataset Statistics")
            st.write(f"Total job listings: {len(df)}")
            st.write(f"Unique companies: {df['company'].nunique()}")
            st.write(f"Unique locations: {df['location'].nunique()}")
            st.write(f"Date range: {df['date_posted'].min()} to {df['date_posted'].max()}")

            # Allow the user to download the concatenated dataset
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download concatenated dataset as CSV",
                data=csv,
                file_name="concatenated_job_listings.csv",
                mime="text/csv",
            )
        else:
            st.error("No data available to display.")

if __name__ == "__main__":
    main()
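
For a quick sanity check outside Streamlit, the listing step can be run on its own. A minimal sketch, assuming the token is exported as an HF_TOKEN environment variable and that "your-username" stands in for the actual HF_USERNAME value:

import os

from huggingface_hub import HfApi

# Hypothetical standalone check: list the CSV files the app would concatenate.
# "your-username/jobeasz" is a placeholder for the real dataset repo id.
api = HfApi(token=os.environ["HF_TOKEN"])
files = api.list_repo_files(repo_id="your-username/jobeasz", repo_type="dataset")
print([f for f in files if f.endswith(".csv")])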