Niharmahesh committed
Commit d8ebc25 · verified · 1 Parent(s): 81b2b74

Update app.py

Files changed (1)
  1. app.py +103 -178
app.py CHANGED
@@ -1,195 +1,120 @@
 import streamlit as st
+from jobspy import scrape_jobs
 import pandas as pd
-import plotly.express as px
+from datasets import Dataset
 from huggingface_hub import HfApi
+import os
+from datetime import datetime
 import io
-from datetime import datetime, timedelta
-import time
-
-# Set page config for a wider layout and custom theme
-st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
-
-# Custom CSS for black background and styling
-st.markdown("""
-<style>
-.stApp {
-    background-color: #000000;
-    color: #FFFFFF;
-}
-.stButton>button {
-    background-color: #4e79a7;
-    color: white;
-}
-.stSelectbox, .stMultiSelect {
-    color: #FFFFFF;
-}
-.stDataFrame {
-    background-color: #1E1E1E;
-}
-.plotly-graph-div {
-    background-color: #1E1E1E;
-}
-.big-font {
-    font-size: 48px;
-    font-weight: bold;
-    text-align: center;
-}
-</style>
-""", unsafe_allow_html=True)
+import hashlib
 
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-@st.cache_data(ttl=3600)
-def load_and_concat_data():
-    api = HfApi()
-    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-    csv_files = [file for file in dataset_files if file.endswith('.csv')]
-
-    all_data = []
-    for file in csv_files:
-        try:
-            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content)
-            all_data.append(df)
-        except Exception:
-            pass  # Silently skip files that can't be processed
-
-    if not all_data:
-        return pd.DataFrame()
-
-    concatenated_df = pd.concat(all_data, ignore_index=True)
-
-    columns_to_keep = [
-        'site', 'job_url', 'title', 'company', 'location',
-        'job_type', 'date_posted', 'is_remote', 'description', 'company_url'
-    ]
-    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-
-    # Drop duplicates
-    filtered_df = filtered_df.drop_duplicates()
-
-    return filtered_df
-
-@st.cache_data
-def get_unique_values(df):
-    return {
-        'companies': df['company'].unique(),
-        'locations': df['location'].unique(),
-        'job_types': df['job_type'].unique()
-    }
-
-def display_timer():
-    placeholder = st.empty()
-    for i in range(15, 0, -1):
-        placeholder.markdown(f"<p class='big-font'>Loading data... {i}</p>", unsafe_allow_html=True)
-        time.sleep(1)
-    placeholder.empty()
-
-def main():
-    st.title("Job Listings Dashboard")
-
-    display_timer()
-
-    df = load_and_concat_data()
-
-    if df.empty:
-        st.error("No data available. Please check your dataset.")
-        return
-
-    # Sidebar for navigation
-    st.sidebar.title("Navigation")
-    page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
-
-    if page == "Dashboard":
-        display_dashboard(df)
-    elif page == "Data Explorer":
-        display_data_explorer(df)
-
-@st.cache_data
-def create_chart(data, x, y, title, color_sequence):
-    fig = px.bar(data, x=x, y=y, title=title, color_discrete_sequence=color_sequence)
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    return fig
-
-def display_dashboard(df):
-    col1, col2 = st.columns(2)
-
-    with col1:
-        st.subheader("Job Postings Overview")
-        st.metric("Total Job Postings", len(df))
-        st.metric("Unique Companies", df['company'].nunique())
-        st.metric("Unique Locations", df['location'].nunique())
-
-        min_date = df['date_posted'].min().date()
-        max_date = df['date_posted'].max().date()
-        st.write(f"Job postings from {min_date} to {max_date}")
-
-    with col2:
-        top_companies = df['company'].value_counts().head(10)
-        fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
-    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-    st.plotly_chart(fig, use_container_width=True)
-
-    col3, col4 = st.columns(2)
-
-    with col3:
-        top_locations = df['location'].value_counts().head(10)
-        fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
-        st.plotly_chart(fig, use_container_width=True)
-
-    with col4:
-        job_types = df['job_type'].value_counts()
-        fig = px.pie(names=job_types.index, values=job_types.values, title="Job Types Distribution", color_discrete_sequence=px.colors.qualitative.Pastel)
-        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
-        st.plotly_chart(fig, use_container_width=True)
-
-@st.cache_data
-def filter_dataframe(df, companies, locations, job_types):
-    filtered_df = df
-    if companies:
-        filtered_df = filtered_df[filtered_df['company'].isin(companies)]
-    if locations:
-        filtered_df = filtered_df[filtered_df['location'].isin(locations)]
-    if job_types:
-        filtered_df = filtered_df[filtered_df['job_type'].isin(job_types)]
-    return filtered_df
-
-def display_data_explorer(df):
-    st.subheader("Data Explorer")
-
-    show_all = st.radio("Display", ("All Data", "Filtered Data"))
-
-    if show_all == "Filtered Data":
-        unique_values = get_unique_values(df)
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            companies = st.multiselect("Select Companies", options=unique_values['companies'])
-        with col2:
-            locations = st.multiselect("Select Locations", options=unique_values['locations'])
-        with col3:
-            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
-
-        filtered_df = filter_dataframe(df, companies, locations, job_types)
-    else:
-        filtered_df = df
-
-    st.write(f"Showing {len(filtered_df)} job listings")
-
-    def make_clickable(url):
-        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">Link</a>'
-
-    filtered_df['job_url'] = filtered_df['job_url'].apply(make_clickable)
-    filtered_df['company_url'] = filtered_df['company_url'].apply(make_clickable)
-
-    st.write(filtered_df.to_html(escape=False, index=False), unsafe_allow_html=True)
-
-if __name__ == "__main__":
-    main()
+@st.cache_data
+def load_job_titles():
+    return [
+        "Data Analyst", "Data Scientist", "Data Engineer", "Machine Learning Engineer",
+        # ... (rest of the job titles)
+        "Data Annotation Expert", "Data Crowdsourcing Manager"
+    ]
+
+@st.cache_data
+def load_locations():
+    return [
+        "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
+        # ... (rest of the locations)
+        "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
+    ]
+
+def generate_random_filename():
+    current_time = datetime.now().isoformat()
+    hash_object = hashlib.md5(current_time.encode())
+    random_hash = hash_object.hexdigest()[:8]
+    return f"{random_hash}.csv"
+
+def update_huggingface_dataset(jobs):
+    df = pd.DataFrame(jobs)
+    filename = generate_random_filename()
+
+    if not os.path.exists("data"):
+        os.makedirs("data")
+
+    local_path = os.path.join("data", filename)
+    df.to_csv(local_path, index=False)
+
+    csv_string = df.to_csv(index=False)
+    file_obj = io.BytesIO(csv_string.encode())
+
+    api = HfApi()
+    try:
+        api.upload_file(
+            path_or_fileobj=file_obj,
+            path_in_repo=f"data/{filename}",
+            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Added new job listings"
+        )
+        return len(jobs), filename
+    except Exception as e:
+        st.error(f"Error uploading file to Hugging Face: {str(e)}")
+        return 0, None
+
+st.title("Job Scraper and Hugging Face Dataset Updater")
+
+job_titles = load_job_titles()
+locations = load_locations()
+
+search_term = st.selectbox("Job Title", options=job_titles, index=0)
+location = st.selectbox("Location", options=locations, index=0)
+
+results_wanted = st.number_input("Number of Results", min_value=1, max_value=100, value=20)
+hours_old = st.number_input("Hours Old", min_value=1, max_value=168, value=72)
+
+job_boards = st.multiselect(
+    "Select Job Boards",
+    ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
+    default=["indeed", "linkedin", "zip_recruiter", "glassdoor"]
+)
+
+if st.button("Scrape Jobs and Update Hugging Face Dataset"):
+    try:
+        with st.spinner("Scraping jobs..."):
+            jobs = scrape_jobs(
+                site_name=job_boards,
+                search_term=search_term,
+                location=location,
+                results_wanted=results_wanted,
+                hours_old=hours_old,
+                country_indeed='USA'
+            )
+
+        st.success(f"Found {len(jobs)} jobs")
+
+        df = pd.DataFrame(jobs)
+
+        st.subheader("Job Listings Preview")
+        st.dataframe(df.head())
+
+        with st.spinner("Updating Hugging Face dataset..."):
+            updated_count, filename = update_huggingface_dataset(jobs)
+
+        if updated_count > 0:
+            st.success(f"Hugging Face dataset updated successfully with {updated_count} job listings!")
+            st.info(f"New file created: {filename}")
+
+            st.markdown(f"View your dataset: https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
+        else:
+            st.error("Failed to update Hugging Face dataset. Please check your permissions and try again.")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+
+st.sidebar.header("About")
+st.sidebar.info(
+    "This app uses JobSpy to scrape job listings from various job boards "
+    "and updates a Hugging Face dataset with the results. "
+    "Enter your search criteria, select the job boards, and click 'Scrape Jobs and Update Hugging Face Dataset' to start."
+)
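
Since each scrape is uploaded as a separate data/<hash>.csv file, anything that consumes this dataset has to stitch the files back together. The loader in the previous revision of app.py (the removed block above) did exactly that with list_repo_files and hf_hub_download; below is a minimal standalone sketch along the same lines, where REPO_ID, the token argument, and the helper name load_all_listings are illustrative placeholders rather than values from this commit:

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "your-username/jobeasz"  # placeholder: app.py builds this as f"{HF_USERNAME}/{DATASET_NAME}"

def load_all_listings(token=None):
    """Rebuild one DataFrame from every CSV uploaded to the dataset repo."""
    api = HfApi()
    # List every file in the dataset repo, then keep only the uploaded CSVs
    files = api.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=token)
    frames = []
    for name in files:
        if not name.endswith(".csv"):
            continue
        # hf_hub_download returns a local cache path, which pd.read_csv accepts
        path = hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset", token=token)
        frames.append(pd.read_csv(path))
    if not frames:
        return pd.DataFrame()
    # Stitch the per-scrape files together and drop listings duplicated across scrapes
    return pd.concat(frames, ignore_index=True).drop_duplicates()

Dropping duplicates at read time mirrors what the removed dashboard code did and keeps repeated scrapes of the same boards from inflating counts.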