Commit 6e54bce · committed by forestav
first commit
Browse files
- .gitignore +2 -0
- app.py +199 -0
- bootstrap.py +28 -0
- get_ads.py +32 -0
- main.py +46 -0
- pinecone_handler.py +192 -0
- settings.py +41 -0
- time_handling.py +32 -0
- timestamp2.txt +1 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+venv
+__pycache__
app.py
ADDED
@@ -0,0 +1,199 @@
+import streamlit as st
+import PyPDF2
+import io
+import docx2txt
+from typing import Optional
+import re
+from pinecone_handler import PineconeHandler
+from time_handling import read_timestamp
+
+def extract_text_from_pdf(pdf_file) -> str:
+    """Extract text content from PDF file"""
+    pdf_reader = PyPDF2.PdfReader(pdf_file)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text() + "\n"
+    return text
+
+def extract_text_from_docx(docx_file) -> str:
+    """Extract text content from DOCX file"""
+    text = docx2txt.process(docx_file)
+    return text
+
+def extract_resume_text(uploaded_file) -> Optional[str]:
+    """Extract text from uploaded resume file"""
+    if uploaded_file is None:
+        return None
+
+    # Get the file extension
+    file_extension = uploaded_file.name.split('.')[-1].lower()
+
+    try:
+        # Process based on file type
+        if file_extension == 'pdf':
+            return extract_text_from_pdf(uploaded_file)
+        elif file_extension in ['docx', 'doc']:
+            return extract_text_from_docx(uploaded_file)
+        elif file_extension == 'txt':
+            return str(uploaded_file.read(), "utf-8")
+        else:
+            st.error(f"Unsupported file format: {file_extension}")
+            return None
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
+        return None
+
+def clean_resume_text(text: str) -> str:
+    """Clean and process resume text"""
+    if not text:
+        return ""
+
+    # Remove special characters and extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+
+    return text
+
+def is_description_truncated(description: str) -> bool:
+    """Check if the description appears to be truncated"""
+    # Check for obvious truncation indicators
+    truncation_indicators = [
+        lambda x: len(x) >= 995,  # Close to the 1000 char limit
+        lambda x: x.rstrip().endswith(('...', '…')),
+        lambda x: re.search(r'\w+$', x) and not re.search(r'[.!?]$', x),  # Ends mid-word or without punctuation
+    ]
+
+    return any(indicator(description) for indicator in truncation_indicators)
+
+def format_job_description(description: str, truncated: bool = False) -> str:
+    """Format job description text with proper sections and line breaks"""
+    if not description:
+        return ""
+
+    # Common section headers in job descriptions
+    sections = [
+        "About us", "About you", "About the role", "About the position",
+        "Requirements", "Qualifications", "Skills", "Responsibilities",
+        "What you'll do", "What we offer", "Benefits", "Your profile",
+        "Required skills", "What you need", "Who you are"
+    ]
+
+    # Add line breaks before section headers
+    formatted_text = description
+    for section in sections:
+        # Look for section headers with case-insensitive matching
+        pattern = re.compile(f'({section}:?)', re.IGNORECASE)
+        formatted_text = pattern.sub(r'\n\n\1', formatted_text)
+
+    # Handle bullet points (both • and - symbols)
+    formatted_text = re.sub(r'[•-]\s*', '\n• ', formatted_text)
+
+    # Add line breaks for sentences that look like list items
+    formatted_text = re.sub(r'(?<=\w)\.(?=\s*[A-Z])', '.\n', formatted_text)
+
+    # Clean up any excessive line breaks
+    formatted_text = re.sub(r'\n{3,}', '\n\n', formatted_text)
+
+    if truncated:
+        formatted_text = formatted_text.rstrip() + "..."
+
+    return formatted_text.strip()
+
+
+
+def main():
+    st.title("Resume-Based Job Search")
+    st.write("Upload your resume to find matching job opportunities")
+
+    # Initialize PineconeHandler
+    try:
+        handler = PineconeHandler()
+    except Exception as e:
+        st.error(f"Error connecting to Pinecone: {str(e)}")
+        return
+
+    # File uploader
+    uploaded_file = st.file_uploader("Upload your resume", type=['pdf', 'docx', 'doc', 'txt'])
+
+    # Search parameters
+    num_results = st.slider("Number of results", min_value=1, max_value=20, value=5)
+
+    if uploaded_file:
+        with st.spinner("Processing resume..."):
+            # Extract and clean resume text
+            resume_text = extract_resume_text(uploaded_file)
+            if resume_text:
+                clean_text = clean_resume_text(resume_text)
+
+                # Preview extracted text
+                with st.expander("Preview extracted text"):
+                    st.text(clean_text[:500] + "..." if len(clean_text) > 500 else clean_text)
+
+                # Search button
+                if st.button("Search Jobs"):
+                    with st.spinner("Searching for matching jobs..."):
+                        try:
+                            # Search for similar job ads
+                            results = handler.search_similar_ads(clean_text, top_k=num_results)
+
+                            if results:
+                                st.subheader("Matching Jobs")
+                                for i, match in enumerate(results, 1):
+                                    metadata = match.metadata
+                                    score = match.score
+
+                                    # Create job card
+                                    with st.container():
+                                        # Header section with key information
+                                        col1, col2 = st.columns([2, 1])
+                                        with col1:
+                                            st.markdown(f"### {metadata['headline']}")
+                                        with col2:
+                                            st.markdown(f"**Match Score:** {score:.2f}")
+
+                                        # Job details section
+                                        st.markdown(f"**Company:** {metadata.get('company', 'Not specified')}")
+                                        st.markdown(f"**Location:** {metadata['city']}")
+                                        st.markdown(f"**Occupation:** {metadata['occupation']}")
+                                        st.markdown(f"**Published:** {metadata['published']}")
+                                        if metadata.get('logo_url'):
+                                            st.image(metadata['logo_url'], width=100)
+
+                                        # Check if description is truncated
+                                        description = metadata['description']
+                                        is_truncated = is_description_truncated(description)
+
+                                        # Display initial description preview
+                                        formatted_description = format_job_description(
+                                            description[:500] if is_truncated else description,
+                                            truncated=is_truncated
+                                        )
+                                        st.markdown(formatted_description)
+
+                                        # If truncated, show expandable full description
+                                        if is_truncated:
+                                            with st.expander("Read Full Description"):
+                                                # Try to fetch full description from webpage_url
+                                                st.markdown("""
+                                                **Note:** The full description has been truncated in our database.
+                                                Please visit the original job posting for complete details.
+                                                """)
+                                                if metadata.get('webpage_url'):
+                                                    st.markdown(f"[View Original Job Posting]({metadata['webpage_url']})")
+
+                                        # Application section
+                                        st.markdown("### How to Apply")
+                                        if metadata.get('webpage_url'):
+                                            st.markdown(f"[Apply Online]({metadata['webpage_url']})")
+                                        if metadata.get('email'):
+                                            st.markdown(f"📧 Contact: {metadata['email']}")
+
+                                        st.markdown("---")
+                            else:
+                                st.info("No matching jobs found. Try adjusting your search criteria.")
+
+                        except Exception as e:
+                            st.error(f"Error searching jobs: {str(e)}")
+
+if __name__ == "__main__":
+    main()
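The UI above is a thin layer over PineconeHandler. A minimal sketch of the same search path outside Streamlit, assuming settings.py is configured and the index is populated (the query string is illustrative):

from pinecone_handler import PineconeHandler

handler = PineconeHandler()
for match in handler.search_similar_ads("Python developer, NLP, Streamlit", top_k=3):
    # each match carries the metadata prepared in pinecone_handler.py
    print(f"{match.score:.2f}  {match.metadata['headline']}")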
bootstrap.py
ADDED
@@ -0,0 +1,28 @@
+import sys
+import logging
+import time_handling
+from get_ads import get_all_ads
+from pinecone_handler import PineconeHandler, load_all
+
+from settings import LOG_LEVEL, LOG_DATE_FORMAT, LOG_FORMAT, PLACES, OCCUPATIONS
+
+log = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
+
+if __name__ == '__main__':
+    """
+    This is executed once to initialize the Pinecone database and
+    load all ads into it. To keep the database updated, run main.py
+    """
+    # Initialize Pinecone handler
+    handler = PineconeHandler()
+    log.info('Pinecone connection initialized')
+
+    if PLACES or OCCUPATIONS:
+        # If filtering by location/occupation, set past timestamp
+        timestamp = time_handling.write_timestamp('2022-01-01T00:00:00')
+    else:
+        timestamp = time_handling.write_timestamp()
+    all_ads = get_all_ads()
+    load_all(all_ads)
+    log.info(f'Loaded {len(all_ads)} ads into Pinecone. Timestamp: {timestamp}')
get_ads.py
ADDED
@@ -0,0 +1,32 @@
+import sys
+import json
+import logging
+import requests
+
+from settings import LOG_LEVEL, LOG_DATE_FORMAT, LOG_FORMAT, STREAM_URL, SNAPSHOT_URL, PLACES, OCCUPATIONS
+
+log = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
+
+
+def _get(url, params={}):
+    log.info(f'Collecting ads from: {url} with params {params}')
+    headers = {'Accept': 'application/json'}
+    response = requests.get(url, headers=headers, params=params)
+    response.raise_for_status()
+    list_of_ads = json.loads(response.content.decode('utf8'))
+    log.info(f"Got {len(list_of_ads)} ads from {url}. Params: {params}")
+    return list_of_ads
+
+
+def get_all_ads():
+    return _get(SNAPSHOT_URL)
+
+
+def get_ads_since_time(timestamp):
+    params = {'date': timestamp}
+    if PLACES:
+        params['location-concept-id'] = PLACES
+    if OCCUPATIONS:
+        params['occupation-concept-id'] = OCCUPATIONS
+    return _get(STREAM_URL, params)
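For reference, a sketch of the stream call get_ads_since_time() builds; the timestamp is illustrative and the parameter names come from the code above:

from get_ads import get_ads_since_time

# With PLACES and OCCUPATIONS empty this sends GET STREAM_URL?date=<timestamp>
ads = get_ads_since_time('2024-12-22T13:55:53')
print(f"{len(ads)} ads changed since the timestamp")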
main.py
ADDED
@@ -0,0 +1,46 @@
+import sys
+from time import sleep
+import logging
+
+from pinecone_handler import PineconeHandler
+import get_ads
+from time_handling import timestamp_now, write_timestamp, read_timestamp
+
+from settings import LOG_LEVEL, LOG_DATE_FORMAT, LOG_FORMAT, MAX_UPDATES, SLEEP_TIME_MINUTES
+
+log = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
+
+def keep_updated():
+    handler = PineconeHandler()
+    last_timestamp = read_timestamp()
+    counter = 0
+
+    while True:
+        new_timestamp = timestamp_now()
+        log.info(f"Getting ads after timestamp '{last_timestamp}'")
+
+        if ads := get_ads.get_ads_since_time(last_timestamp):
+            handler.upsert_ads(ads)
+        else:
+            log.info(f"No ads found after timestamp '{last_timestamp}'")
+
+        write_timestamp(new_timestamp)
+        counter += 1
+        log.info(f"Completed update {counter} of {MAX_UPDATES}")
+
+        if counter == MAX_UPDATES:
+            break
+
+        log.info(f"Waiting {SLEEP_TIME_MINUTES} minutes before collecting ads again")
+        sleep(SLEEP_TIME_MINUTES * 60)
+
+    log.info('Finished')
+
+if __name__ == '__main__':
+    """
+    Important:
+    You must run bootstrap.py first to initialize Pinecone and
+    load current ads into the database (if applicable)
+    """
+    keep_updated()
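With the defaults in settings.py (MAX_UPDATES = 4, SLEEP_TIME_MINUTES = 0.1), keep_updated() polls the stream four times with six-second pauses. A sketch of driving it from another module; importing is safe because main.py guards its entry point:

from main import keep_updated

keep_updated()  # runs MAX_UPDATES rounds, then logs 'Finished'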
pinecone_handler.py
ADDED
@@ -0,0 +1,192 @@
+from datetime import datetime
+import sys
+import logging
+from pinecone import Pinecone, ServerlessSpec
+from sentence_transformers import SentenceTransformer
+from typing import List, Dict, Any
+
+from settings import (
+    LOG_LEVEL,
+    LOG_DATE_FORMAT,
+    LOG_FORMAT,
+    PINECONE_API_KEY,
+    PINECONE_ENVIRONMENT,
+    PINECONE_INDEX_NAME
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
+
+class PineconeHandler:
+    """
+    Handles connections and operations with Pinecone vector database
+    for storing and retrieving job ads
+    """
+    def __init__(self):
+        self.pc = Pinecone(api_key=PINECONE_API_KEY)
+        self.BATCH_SIZE = 100  # Number of vectors to upsert at once
+
+        try:
+            self.index = self.pc.Index(PINECONE_INDEX_NAME)
+            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
+        except Exception as e:
+            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
+            spec = ServerlessSpec(
+                cloud="aws",
+                region="us-west-2"
+            )
+
+            self.pc.create_index(
+                name=PINECONE_INDEX_NAME,
+                dimension=384,
+                metric="cosine",
+                spec=spec
+            )
+            self.index = self.pc.Index(PINECONE_INDEX_NAME)
+
+        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
+
+    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
+        """Create embedding from job ad text"""
+        try:
+            # Safely get text fields with fallbacks to empty string
+            headline = ad.get('headline', '') or ''
+            occupation = ad.get('occupation', {})
+            occupation_label = occupation.get('label', '') if occupation else ''
+            description = ad.get('description', {})
+            description_text = description.get('text', '') if description else ''
+
+            # Combine text fields
+            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()
+
+            # If we have no text to embed, raise an exception
+            if not text_to_embed:
+                raise ValueError("No text content available for embedding")
+
+            return self.model.encode(text_to_embed).tolist()
+        except Exception as e:
+            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
+            raise
+
+    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
+        """Extract metadata from ad for storage"""
+        try:
+            # Safely get nested values with fallbacks
+            application_details = ad.get('application_details', {}) or {}
+            workplace_address = ad.get('workplace_address', {}) or {}
+            occupation = ad.get('occupation', {}) or {}
+            description = ad.get('description', {}) or {}
+
+            # Limit the size of text fields and handle potential None values
+            return {
+                'email': (application_details.get('email', '') or '')[:100],
+                'city': (workplace_address.get('municipality', '') or '')[:100],
+                'occupation': (occupation.get('label', '') or '')[:100],
+                'headline': (ad.get('headline', '') or '')[:200],
+                'description': (description.get('text', '') or '')[:1000],
+                'logo_url': (ad.get('logo_url', '') or '')[:200],
+                'webpage_url': (ad.get('webpage_url', '') or '')[:200],
+                'published': (ad.get('publication_date', '') or '')[:50]
+            }
+        except Exception as e:
+            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
+            raise
+
+    def _batch_upsert(self, vectors: List[tuple]) -> None:
+        """
+        Upsert a batch of vectors to Pinecone
+
+        Args:
+            vectors: List of tuples, each containing (id, vector, metadata)
+        """
+        try:
+            # Prepare the vectors in the format Pinecone expects
+            upsert_data = [(str(id), vec, meta) for id, vec, meta in vectors]
+
+            # Perform the upsert operation
+            self.index.upsert(vectors=upsert_data)
+
+            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
+        except Exception as e:
+            log.error(f"Error upserting batch: {str(e)}")
+            raise
+
+    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
+        """Insert or update multiple ads in batches"""
+        vectors = []
+        deleted = 0
+        processed = 0
+        skipped = 0
+
+        for ad in ads:
+            try:
+                # Skip None or empty ads
+                if not ad:
+                    log.warning("Skipping None or empty ad")
+                    skipped += 1
+                    continue
+
+                ad_id = ad.get('id')
+                if not ad_id:
+                    log.warning("Skipping ad without ID")
+                    skipped += 1
+                    continue
+
+                if ad.get('removed', False):
+                    self.delete_ad(ad_id)
+                    deleted += 1
+                    continue
+
+                try:
+                    vector = self._create_embedding(ad)
+                    metadata = self._prepare_metadata(ad)
+                    vectors.append((ad_id, vector, metadata))
+                    processed += 1
+
+                    # When we reach batch size, upsert the batch
+                    if len(vectors) >= self.BATCH_SIZE:
+                        self._batch_upsert(vectors)
+                        vectors = []  # Clear the batch
+
+                except Exception as e:
+                    log.error(f"Error processing ad {ad_id}: {str(e)}")
+                    skipped += 1
+
+            except Exception as e:
+                log.error(f"Unexpected error processing ad: {str(e)}")
+                skipped += 1
+
+        # Upsert any remaining vectors
+        if vectors:
+            self._batch_upsert(vectors)
+
+        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")
+
+    def delete_ad(self, ad_id: str) -> None:
+        """Delete an ad by ID"""
+        try:
+            self.index.delete(ids=[ad_id])
+            log.debug(f"Deleted ad {ad_id} from Pinecone")
+        except Exception as e:
+            log.error(f"Error deleting ad {ad_id}: {str(e)}")
+
+    def search_similar_ads(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
+        """Search for similar job ads based on text query"""
+        query_embedding = self.model.encode(query).tolist()
+        results = self.index.query(
+            vector=query_embedding,
+            top_k=top_k,
+            include_metadata=True
+        )
+        return results.matches
+
+def load_all(all_ads):
+    handler = PineconeHandler()
+    handler.upsert_ads(all_ads)
+
+def update(list_of_updated_ads):
+    start = datetime.now()
+    handler = PineconeHandler()
+    handler.upsert_ads(list_of_updated_ads)
+    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - start}")
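A minimal upsert sketch using the field names _create_embedding() and _prepare_metadata() expect; the ad dict is entirely illustrative:

from pinecone_handler import PineconeHandler

ad = {
    'id': 'demo-001',  # hypothetical ad ID
    'headline': 'Backend Developer',
    'occupation': {'label': 'Software developer'},
    'description': {'text': 'Build and maintain APIs.'},
    'workplace_address': {'municipality': 'Umeå'},
    'publication_date': '2024-12-22T13:55:53',
}

handler = PineconeHandler()
handler.upsert_ads([ad])  # embeds, prepares metadata, upserts in one batch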
settings.py
ADDED
@@ -0,0 +1,41 @@
+import logging
+
+PINECONE_API_KEY = "pcsk_3nKkDX_K2CLsXYCkmJwP2gkv2HtCEe3ksJ7J3uQgx9ajAG63BFezrQUC5jFZVjadte4Sh8"
+PINECONE_ENVIRONMENT = "gcp-starter"
+PINECONE_INDEX_NAME = "jobads-index"
+
+DB_TABLE_NAME = 'jobads'
+DB_FILE_NAME = 'jobads_database_20220127.db'
+TIMESTAMP_FILE = 'timestamp2.txt'
+
+BASE_URL = 'https://jobstream.api.jobtechdev.se'
+STREAM_URL = f"{BASE_URL}/stream"
+SNAPSHOT_URL = f"{BASE_URL}/snapshot"
+
+SLEEP_TIME_MINUTES = 0.1
+MAX_UPDATES = 4
+
+DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
+
+# Logging
+LOG_LEVEL = logging.INFO  # Change INFO to DEBUG for verbose logging
+LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
+LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
+
+"""
+Examples for the municipalities in Västerbottens Län:
+Skellefteå - kicB_LgH_2Dk
+Robertsfors - p8Mv_377_bxp
+Norsjö - XmpG_vPQ_K7T
+Vindeln - izT6_zWu_tta
+Umeå - QiGt_BLu_amP
+Vännäs - utQc_6xq_Dfm
+"""
+
+# if you don't want to do geographical filtering, set PLACES = []
+#PLACES = ['kicB_LgH_2Dk', 'p8Mv_377_bxp', 'XmpG_vPQ_K7T', 'izT6_zWu_tta', 'QiGt_BLu_amP', 'utQc_6xq_Dfm']
+PLACES = []
+
+# if you don't want to do filtering on occupations, set OCCUPATIONS = []
+#OCCUPATIONS = ['Z6TY_xDf_Yup']  # Städare (cleaners)
+OCCUPATIONS = []
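To scope updates geographically, populate PLACES with concept IDs from the docstring above, for example:

PLACES = ['QiGt_BLu_amP', 'kicB_LgH_2Dk']  # Umeå and Skellefteå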
time_handling.py
ADDED
@@ -0,0 +1,32 @@
+import sys
+
+from datetime import datetime
+import logging
+
+from settings import LOG_LEVEL, LOG_DATE_FORMAT, LOG_FORMAT, DATE_FORMAT, TIMESTAMP_FILE
+
+log = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
+
+
+def elapsed_time(start):
+    return datetime.now() - start
+
+
+def timestamp_now():
+    return datetime.now().strftime(DATE_FORMAT)
+
+
+def write_timestamp(timestamp=None):
+    if not timestamp:
+        timestamp = timestamp_now()
+    with open(file=TIMESTAMP_FILE, mode='w') as f:
+        f.write(timestamp)
+    log.info(f"New timestamp written: {timestamp}")
+    return timestamp
+
+
+def read_timestamp():
+    with open(file=TIMESTAMP_FILE, mode='r') as f:
+        timestamp = f.read()
+    return timestamp.strip('\n')
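A round-trip sketch of the two file helpers above (both operate on TIMESTAMP_FILE, i.e. timestamp2.txt):

from time_handling import write_timestamp, read_timestamp

ts = write_timestamp()  # defaults to "now" in DATE_FORMAT
assert read_timestamp() == ts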
timestamp2.txt
ADDED
@@ -0,0 +1 @@
+2024-12-22T13:55:53