Niharmahesh committed · verified
Commit b920577 · Parent(s): d8ebc25

Update app.py

Files changed (1):
  1. app.py +178 -103
app.py CHANGED
@@ -1,120 +1,195 @@
 import streamlit as st
-from jobspy import scrape_jobs
 import pandas as pd
-from datasets import Dataset
+import plotly.express as px
 from huggingface_hub import HfApi
-import os
-from datetime import datetime
 import io
-import hashlib
+from datetime import datetime, timedelta
+import time
+
+# Set page config for a wider layout and custom theme
+st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
+
+# Custom CSS for black background and styling
+st.markdown("""
+<style>
+    .stApp {
+        background-color: #000000;
+        color: #FFFFFF;
+    }
+    .stButton>button {
+        background-color: #4e79a7;
+        color: white;
+    }
+    .stSelectbox, .stMultiSelect {
+        color: #FFFFFF;
+    }
+    .stDataFrame {
+        background-color: #1E1E1E;
+    }
+    .plotly-graph-div {
+        background-color: #1E1E1E;
+    }
+    .big-font {
+        font-size: 48px;
+        font-weight: bold;
+        text-align: center;
+    }
+</style>
+""", unsafe_allow_html=True)
 
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-@st.cache_data
-def load_job_titles():
-    return [
-        "Data Analyst", "Data Scientist", "Data Engineer", "Machine Learning Engineer",
-        # ... (rest of the job titles)
-        "Data Annotation Expert", "Data Crowdsourcing Manager"
-    ]
+@st.cache_data(ttl=3600)
+def load_and_concat_data():
+    api = HfApi()
+    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
+    csv_files = [file for file in dataset_files if file.endswith('.csv')]
 
-@st.cache_data
-def load_locations():
-    return [
-        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
-        # ... (rest of the locations)
-        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
-    ]
+    all_data = []
+    for file in csv_files:
+        try:
+            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = pd.read_csv(file_content)
+            all_data.append(df)
+        except Exception:
+            pass # Silently skip files that can't be processed
 
-def generate_random_filename():
-    current_time = datetime.now().isoformat()
-    hash_object = hashlib.md5(current_time.encode())
-    random_hash = hash_object.hexdigest()[:8]
-    return f"{random_hash}.csv"
+    if not all_data:
+        return pd.DataFrame()
 
-def update_huggingface_dataset(jobs):
-    df = pd.DataFrame(jobs)
-    filename = generate_random_filename()
-
-    if not os.path.exists("data"):
-        os.makedirs("data")
+    concatenated_df = pd.concat(all_data, ignore_index=True)
 
-    local_path = os.path.join("data", filename)
-    df.to_csv(local_path, index=False)
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
 
-    csv_string = df.to_csv(index=False)
-    file_obj = io.BytesIO(csv_string.encode())
+    # Drop duplicates
+    filtered_df = filtered_df.drop_duplicates()
 
-    api = HfApi()
-    try:
-        api.upload_file(
-            path_or_fileobj=file_obj,
-            path_in_repo=f"data/{filename}",
-            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-            repo_type="dataset",
-            token=HF_TOKEN,
-            commit_message="Added new job listings"
-        )
-        return len(jobs), filename
-    except Exception as e:
-        st.error(f"Error uploading file to Hugging Face: {str(e)}")
-        return 0, None
-
-st.title("Job Scraper and Hugging Face Dataset Updater")
-
-job_titles = load_job_titles()
-locations = load_locations()
-
-search_term = st.selectbox("Job Title", options=job_titles, index=0)
-location = st.selectbox("Location", options=locations, index=0)
-
-results_wanted = st.number_input("Number of Results", min_value=1, max_value=100, value=20)
-hours_old = st.number_input("Hours Old", min_value=1, max_value=168, value=72)
-
-job_boards = st.multiselect(
-    "Select Job Boards",
-    ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
-    default=["indeed", "linkedin", "zip_recruiter", "glassdoor"]
-)
-
-if st.button("Scrape Jobs and Update Hugging Face Dataset"):
-    try:
-        with st.spinner("Scraping jobs..."):
-            jobs = scrape_jobs(
-                site_name=job_boards,
-                search_term=search_term,
-                location=location,
-                results_wanted=results_wanted,
-                hours_old=hours_old,
-                country_indeed='USA'
-            )
-
-        st.success(f"Found {len(jobs)} jobs")
-
-        df = pd.DataFrame(jobs)
-
-        st.subheader("Job Listings Preview")
-        st.dataframe(df.head())
-
-        with st.spinner("Updating Hugging Face dataset..."):
-            updated_count, filename = update_huggingface_dataset(jobs)
+    return filtered_df
+
+@st.cache_data
+def get_unique_values(df):
+    return {
+        'companies': df['company'].unique(),
+        'locations': df['location'].unique(),
+        'job_types': df['job_type'].unique()
+    }
+
+def display_timer():
+    placeholder = st.empty()
+    for i in range(15, 0, -1):
+        placeholder.markdown(f"<p class='big-font'>Loading data... {i}</p>", unsafe_allow_html=True)
+        time.sleep(1)
+    placeholder.empty()
+
+def main():
+    st.title("Job Listings Dashboard")
+
+    display_timer()
+
+    df = load_and_concat_data()
+
+    if df.empty:
+        st.error("No data available. Please check your dataset.")
+        return
+
+    # Sidebar for navigation
+    st.sidebar.title("Navigation")
+    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
+
+    if page == "Dashboard":
+        display_dashboard(df)
+    elif page == "Data Explorer":
+        display_data_explorer(df)
+
+@st.cache_data
+def create_chart(data, x, y, title, color_sequence):
+    fig = px.bar(data, x=x, y=y, title=title, color_discrete_sequence=color_sequence)
+    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+    return fig
+
+def display_dashboard(df):
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.subheader("Job Postings Overview")
+        st.metric("Total Job Postings", len(df))
+        st.metric("Unique Companies", df['company'].nunique())
+        st.metric("Unique Locations", df['location'].nunique())
+
+        min_date = df['date_posted'].min().date()
+        max_date = df['date_posted'].max().date()
+        st.write(f"Job postings from {min_date} to {max_date}")
+
+    with col2:
+        top_companies = df['company'].value_counts().head(10)
+        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
+        st.plotly_chart(fig, use_container_width=True)
+
+    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
+    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
+    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+    st.plotly_chart(fig, use_container_width=True)
+
+    col3, col4 = st.columns(2)
+
+    with col3:
+        top_locations = df['location'].value_counts().head(10)
+        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
+        st.plotly_chart(fig, use_container_width=True)
+
+    with col4:
+        job_types = df['job_type'].value_counts()
+        fig = px.pie(names=job_types.index, values=job_types.values, title="Job Types Distribution", color_discrete_sequence=px.colors.qualitative.Pastel)
+        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
+        st.plotly_chart(fig, use_container_width=True)
+
+@st.cache_data
+def filter_dataframe(df, companies, locations, job_types):
+    filtered_df = df
+    if companies:
+        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
+    if locations:
+        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
+    if job_types:
+        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
+    return filtered_df
+
+def display_data_explorer(df):
+    st.subheader("Data Explorer")
+
+    show_all = st.radio("Display", ("All Data", "Filtered Data"))
+
+    if show_all == "Filtered Data":
+        unique_values = get_unique_values(df)
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            companies = st.multiselect("Select Companies", options=unique_values['companies'])
+        with col2:
+            locations = st.multiselect("Select Locations", options=unique_values['locations'])
+        with col3:
+            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
 
-        if updated_count > 0:
-            st.success(f"Hugging Face dataset updated successfully with {updated_count} job listings!")
-            st.info(f"New file created: {filename}")
-
-            st.markdown(f"View your dataset: https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
-        else:
-            st.error("Failed to update Hugging Face dataset. Please check your permissions and try again.")
-    except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
-
-st.sidebar.header("About")
-st.sidebar.info(
-    "This app uses JobSpy to scrape job listings from various job boards "
-    "and updates a Hugging Face dataset with the results. "
-    "Enter your search criteria, select the job boards, and click 'Scrape Jobs and Update Hugging Face Dataset' to start."
-)
+        filtered_df = filter_dataframe(df, companies, locations, job_types)
+    else:
+        filtered_df = df
+
+    st.write(f"Showing {len(filtered_df)} job listings")
+
+    def make_clickable(url):
+        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'
+
+    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
+    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
+
+    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
+
+if __name__ == "__main__":
+    main()
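
For reference, the read path introduced by load_and_concat_data can be exercised outside Streamlit. Below is a minimal sketch, assuming the dataset repo is Niharmahesh/jobeasz (the committer's account plus the DATASET_NAME above; adjust if HF_USERNAME differs) and that the repo is public or a read token is supplied. hf_hub_download returns a local cache path that pandas can read directly, which is the same mechanism the app's loader relies on.

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "Niharmahesh/jobeasz"  # assumed: HF_USERNAME/DATASET_NAME from the app

api = HfApi()
# List every file in the dataset repo and keep only the CSV shards the dashboard concatenates.
csv_files = [f for f in api.list_repo_files(repo_id=REPO_ID, repo_type="dataset")
             if f.endswith(".csv")]

frames = []
for name in csv_files:
    # Returns a local cache path; pass token="hf_..." here if the repo is private.
    path = hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset")
    frames.append(pd.read_csv(path))

df = pd.concat(frames, ignore_index=True).drop_duplicates()
print(f"{len(df)} rows across {len(csv_files)} CSV files")

Because the app wraps this logic in @st.cache_data(ttl=3600), the repo listing and downloads rerun at most once an hour per session; clearing the Streamlit cache forces a fresh pull of any newly uploaded shards.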