Thanos51 committed
Commit 775ab85 · verified
1 Parent(s): 7c85fc4

Update resume_ranker.py

Files changed (1)
  1. resume_ranker.py +96 -96
resume_ranker.py CHANGED
@@ -1,97 +1,97 @@
- from crewai import Agent, Task, Crew
- from langchain_groq import ChatGroq
- from langchain_community.document_loaders import RecursiveUrlLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from googlesearch import search
- from PyPDF2 import PdfReader
- from dotenv import load_dotenv
- import os
- import logging
- from bs4 import BeautifulSoup
- import re
- load_dotenv()
- logging.basicConfig(filename="Logs/app.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
- llm = ChatGroq(
-     api_key=os.getenv("GROQ_API_KEY"),
-     model="llama3-70b-8192",
-     temperature=0.5,
-     max_tokens=1000
- )
-
- resume_ranker = Agent(
-     role="Resume Ranker",
-     goal="Rank resumes based on job fit with fairness",
-     backstory="An expert in evaluating resumes fairly",
-     llm=llm,
-     verbose=True,
-     allow_delegation=False
- )
-
- def html_to_text(html_content: str) -> str:
-     soup = BeautifulSoup(html_content, 'html.parser')
-
-     # Extract text with proper spacing
-     text = soup.get_text(separator=" ").strip()
-
-     # Remove excessive multiple spaces
-     text = re.sub(r'\s+', ' ', text)
-     return text
-
- def extract_text_from_pdf(file_path=None, file_content=None):
-     if file_path:
-         reader = PdfReader(file_path)
-     elif file_content:
-         reader = PdfReader(file_content)
-     else:
-         raise ValueError("Either file_path or file_content must be provided")
-     text = ""
-     for page in reader.pages:
-         text += page.extract_text() or ""
-     return text
-
- def fetch_related_content(job_description):
-     query = f"{job_description} site:*.edu | site:*.org | site:*.gov -inurl:(signup | login)"
-     urls = list(search(query, num_results=5))
-     documents = []
-     for url in urls:
-         try:
-             loader = RecursiveUrlLoader(url=url, extractor=html_to_text, max_depth=1,
-                                         headers={"User-Agent": "Mozilla/5.0"})
-             docs = loader.load()
-             documents.extend(docs)
-         except Exception as e:
-             logging.error(f"Error loading {url}: {e}")
-     return documents
-
- def store_in_vdb(documents):
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-     chunks = text_splitter.split_documents(documents)
-     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-     return FAISS.from_documents(chunks, embeddings)
-
- def process_resumes(job_description, dir_path=None, uploaded_files=None):
-     resumes = []
-     if dir_path and os.path.isdir(dir_path):
-         for filename in os.listdir(dir_path):
-             if filename.endswith(".pdf"):
-                 file_path = os.path.join(dir_path, filename)
-                 resume_text = extract_text_from_pdf(file_path=file_path)
-                 resumes.append(f"Resume: {filename}\nContent: {resume_text}")
-     elif uploaded_files:
-         for uploaded_file in uploaded_files:
-             resume_text = extract_text_from_pdf(file_content=uploaded_file)
-             resumes.append(f"Resume: {uploaded_file.name}\nContent: {resume_text}")
-     return resumes
-
- def create_resume_rank_task(job_description, dir_path=None, uploaded_files=None):
-     resumes = process_resumes(job_description, dir_path, uploaded_files)
-     if not resumes:
-         return None
-     documents = fetch_related_content(job_description)
-     vdb = store_in_vdb(documents) if documents else None
-     context = vdb.similarity_search(job_description, k=3) if vdb else []
-     context_text = "\n".join([doc.page_content for doc in context]) or "No context."
-     prompt = f"Rank these resumes: {', '.join(resumes)} for '{job_description}' using context: '{context_text}'. Ensure fairness by avoiding bias based on gender, age, or ethnicity. Flag any potential bias in reasoning."
-     return Task(
-         description=prompt,
-         agent=resume_ranker,
-         expected_output="A ranked list with scores (0-100), reasoning, and bias flags."
-     )
 
+ from crewai import Agent, Task, Crew
+ from langchain_groq import ChatGroq
+ from langchain_community.document_loaders import RecursiveUrlLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from googlesearch import search
+ from PyPDF2 import PdfReader
+ from dotenv import load_dotenv
+ import os
+ import logging
+ from bs4 import BeautifulSoup
+ import re
+ load_dotenv()
+ logging.basicConfig(filename="app.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ llm = ChatGroq(
+     api_key=os.getenv("GROQ_API_KEY"),
+     model="llama3-70b-8192",
+     temperature=0.5,
+     max_tokens=1000
+ )
+
+ resume_ranker = Agent(
+     role="Resume Ranker",
+     goal="Rank resumes based on job fit with fairness",
+     backstory="An expert in evaluating resumes fairly",
+     llm=llm,
+     verbose=True,
+     allow_delegation=False
+ )
+
+ def html_to_text(html_content: str) -> str:
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     # Extract text with proper spacing
+     text = soup.get_text(separator=" ").strip()
+
+     # Remove excessive multiple spaces
+     text = re.sub(r'\s+', ' ', text)
+     return text
+
+ def extract_text_from_pdf(file_path=None, file_content=None):
+     if file_path:
+         reader = PdfReader(file_path)
+     elif file_content:
+         reader = PdfReader(file_content)
+     else:
+         raise ValueError("Either file_path or file_content must be provided")
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text() or ""
+     return text
+
+ def fetch_related_content(job_description):
+     query = f"{job_description} site:*.edu | site:*.org | site:*.gov -inurl:(signup | login)"
+     urls = list(search(query, num_results=5))
+     documents = []
+     for url in urls:
+         try:
+             loader = RecursiveUrlLoader(url=url, extractor=html_to_text, max_depth=1,
+                                         headers={"User-Agent": "Mozilla/5.0"})
+             docs = loader.load()
+             documents.extend(docs)
+         except Exception as e:
+             logging.error(f"Error loading {url}: {e}")
+     return documents
+
+ def store_in_vdb(documents):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     chunks = text_splitter.split_documents(documents)
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     return FAISS.from_documents(chunks, embeddings)
+
+ def process_resumes(job_description, dir_path=None, uploaded_files=None):
+     resumes = []
+     if dir_path and os.path.isdir(dir_path):
+         for filename in os.listdir(dir_path):
+             if filename.endswith(".pdf"):
+                 file_path = os.path.join(dir_path, filename)
+                 resume_text = extract_text_from_pdf(file_path=file_path)
+                 resumes.append(f"Resume: {filename}\nContent: {resume_text}")
+     elif uploaded_files:
+         for uploaded_file in uploaded_files:
+             resume_text = extract_text_from_pdf(file_content=uploaded_file)
+             resumes.append(f"Resume: {uploaded_file.name}\nContent: {resume_text}")
+     return resumes
+
+ def create_resume_rank_task(job_description, dir_path=None, uploaded_files=None):
+     resumes = process_resumes(job_description, dir_path, uploaded_files)
+     if not resumes:
+         return None
+     documents = fetch_related_content(job_description)
+     vdb = store_in_vdb(documents) if documents else None
+     context = vdb.similarity_search(job_description, k=3) if vdb else []
+     context_text = "\n".join([doc.page_content for doc in context]) or "No context."
+     prompt = f"Rank these resumes: {', '.join(resumes)} for '{job_description}' using context: '{context_text}'. Ensure fairness by avoiding bias based on gender, age, or ethnicity. Flag any potential bias in reasoning."
+     return Task(
+         description=prompt,
+         agent=resume_ranker,
+         expected_output="A ranked list with scores (0-100), reasoning, and bias flags."
+     )
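
The only functional change in this commit is the logging path: "Logs/app.log" becomes "app.log". That matters because logging.basicConfig does not create missing parent directories, so the old path raises FileNotFoundError on any checkout where the Logs/ folder is absent. A minimal alternative sketch, not part of the commit, if a dedicated log directory is still preferred:

import logging
import os

# Alternative to the commit's fix: create the log directory up front,
# after which the original Logs/app.log path also works.
os.makedirs("Logs", exist_ok=True)
logging.basicConfig(
    filename=os.path.join("Logs", "app.log"),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)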
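
For context: Crew is imported in resume_ranker.py but never used, so the Task returned by create_resume_rank_task has to be wired into a crew by the caller. A minimal usage sketch, assuming the module is importable as resume_ranker and GROQ_API_KEY is set in .env; the job description and "resumes/" directory below are hypothetical placeholders:

from crewai import Crew
from resume_ranker import create_resume_rank_task, resume_ranker

# Hypothetical inputs, for illustration only.
task = create_resume_rank_task(
    job_description="Senior Python developer with LLM experience",
    dir_path="resumes/",  # a directory of PDF resumes
)
if task is not None:  # create_resume_rank_task returns None when no resumes are found
    crew = Crew(agents=[resume_ranker], tasks=[task])
    result = crew.kickoff()
    print(result)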