# Page-header residue from the hosting site, kept as comments so the module parses:
# datasciencedojo's picture
# Update utils/utils.py
# e21cf92 verified
from PyPDF2 import PdfReader
from agents.agents import get_agent_groq
import json
import re
import time
from agents import prompts
def parse_resume(path):
    """Return the text of every page of one PDF, concatenated in page order.

    Args:
        path: Whatever ``PdfReader`` accepts as its source (the callers in
            this module pass one resume per call).

    Returns:
        A single string made of each page's ``extract_text()`` result with
        no separator between pages.

    Side effects:
        Prints the number of pages to stdout before extracting.
    """
    reader = PdfReader(path)
    print(len(reader.pages))
    return ''.join(page.extract_text() for page in reader.pages)
def parse_resumes(resumes_list):
    """Extract the full text of each PDF in ``resumes_list``.

    Args:
        resumes_list: Iterable of sources, each handed to ``PdfReader``.

    Returns:
        A list of strings, one per input and in the same order, where each
        string is that document's page texts concatenated without a separator.
    """
    def _document_text(source):
        # Concatenate the extracted text of every page of one document.
        return ''.join(page.extract_text() for page in PdfReader(source).pages)

    return [_document_text(resume) for resume in resumes_list]
def parse_(resumes_list):
    """Run ``parse_resume`` over every entry of ``resumes_list``.

    Args:
        resumes_list: Iterable of sources, each passed to ``parse_resume``.

    Returns:
        A list with one extracted-text string per input, in input order.
    """
    return [parse_resume(resume) for resume in resumes_list]
from typing_extensions import Annotated, TypedDict, Optional
# Define TypedDict for structured output
# This is the schema handed to ``with_structured_output`` in
# ``generate_analysis_new``. Each field is ``Annotated[type, ..., text]``;
# presumably the structured-output layer reads that as (type, required marker,
# description shown to the model) -- confirm against the agent library.
# NOTE(review): documentation is kept in ``#`` comments on purpose; a class
# docstring might be forwarded to the model as part of the schema -- verify
# before adding one.
class ResumeAnalysis(TypedDict):
    candidate_name: Annotated[str, ..., "Name of the candidate with the highest score"]
    # Requested from the model, but ``generate_analysis_new`` overwrites it
    # with the sum of the four component scores below.
    overall_match_score: Annotated[int, ..., "sum of scores for skills_keywords_score, experience_score, education_certifications_score, and preferred_qualifications_score (Whole Number)"]
    # Component scores and their free-text justifications. The ranges stated
    # in the descriptions (40 + 30 + 20 + 10) add up to a maximum of 100.
    skills_keywords_score: Annotated[int, ..., "Score for Skills and Keywords (0-40)"]
    skills_keywords_explanation: Annotated[str, ..., "Explanation for Skills and Keywords"]
    experience_score: Annotated[int, ..., "Score for Experience (0-30)"]
    experience_explanation: Annotated[str, ..., "Explanation for Experience"]
    education_certifications_score: Annotated[int, ..., "Score for Education & Certifications (0-20)"]
    education_certifications_explanation: Annotated[str, ..., "Explanation for Education & Certifications"]
    preferred_qualifications_score: Annotated[int, ..., "Score for Preferred Qualifications (0-10)"]
    preferred_qualifications_explanation: Annotated[str, ..., "Explanation for Preferred Qualifications"]
    # Wording-only summary of the overall score; the description asks the
    # model not to repeat numbers here.
    score_interpretation: Annotated[str, ..., "donot mention any numbers here, just Interpretation in words of the overall_match_score"]
# Use structured output with the LLM
def generate_analysis_new(resume_text, job_listing_text, job_title_text, must_have, prompt_template):
    """Score one resume against a job listing via structured LLM output.

    Args:
        resume_text: Text of the resume to evaluate.
        job_listing_text: Text of the job listing.
        job_title_text: Job title.
        must_have: Must-have requirements.
        prompt_template: ``str.format`` template; it is formatted with the
            keyword arguments ``resume``, ``job_listing``, ``job_title_text``
            and ``must_have``.

    Returns:
        The ``ResumeAnalysis`` mapping produced by the agent, with
        ``overall_match_score`` replaced by the sum of the four component
        scores. If the agent yields a falsy result (``None`` or an empty
        mapping) it is returned unchanged so the caller can filter it out.

    Raises:
        KeyError: If a non-empty response is missing one of the component
            score fields.

    Side effects:
        Prints the response to stdout.
    """
    # Send the structured prompt to the agent and expect a structured response
    agent = get_agent_groq().with_structured_output(ResumeAnalysis)
    prompt = prompt_template.format(
        resume=resume_text,
        job_listing=job_listing_text,
        job_title_text=job_title_text,
        must_have=must_have,
    )
    response = agent.invoke(prompt)

    # generate_individual_analysis checks ``if structured_response:``, so a
    # falsy response is an expected outcome; indexing it here would raise
    # before that check could ever run.
    if response:
        # Do not trust the model's own arithmetic: recompute the total.
        response['overall_match_score'] = (
            response['skills_keywords_score']
            + response['education_certifications_score']
            + response['experience_score']
            + response['preferred_qualifications_score']
        )
    print(response)
    return response  # structured as per ResumeAnalysis (or falsy)
def generate_analysis(resume_text, job_listing_text, job_title_text, must_have, prompt_template):
    """Ask the agent to analyse one resume and parse its fenced JSON reply.

    Args:
        resume_text: Text of the resume to evaluate.
        job_listing_text: Text of the job listing.
        job_title_text: Job title.
        must_have: Must-have requirements.
        prompt_template: ``str.format`` template, formatted with the keyword
            arguments ``resume``, ``job_listing``, ``job_title_text`` and
            ``must_have``.

    Returns:
        The dictionary that ``extract`` builds from the reply's ``content``.
    """
    llm = get_agent_groq()
    filled_prompt = prompt_template.format(
        resume=resume_text,
        job_listing=job_listing_text,
        job_title_text=job_title_text,
        must_have=must_have,
    )
    reply = llm.invoke(filled_prompt)
    return extract(reply.content)
def generate_sel_analysis(resume_text, job_listing_text, job_title_text, must_have, prompt_template):
    """Run one prompt over the supplied resume text and parse per-candidate JSON.

    Args:
        resume_text: Resume text inserted into the prompt.
        job_listing_text: Text of the job listing.
        job_title_text: Job title.
        must_have: Must-have requirements.
        prompt_template: ``str.format`` template, formatted with the keyword
            arguments ``resume``, ``job_listing``, ``job_title_text`` and
            ``must_have``.

    Returns:
        The list produced by ``extract_sel`` from the reply's ``content``:
        one dictionary per candidate section, or ``[]`` if a section is not
        valid JSON.
    """
    # An earlier revision also called generate_individual_analysis() here and
    # threw the result away. That cost one extra LLM request plus a sleep for
    # every element of ``resume_text`` without affecting the return value,
    # so the call has been removed.
    agent = get_agent_groq()
    response = agent.invoke(
        prompt_template.format(
            resume=resume_text,
            job_listing=job_listing_text,
            job_title_text=job_title_text,
            must_have=must_have,
        )
    )
    return extract_sel(response.content)
# Analyzing each resume individually and handling delays to avoid token limits
def generate_individual_analysis(resumes, job_listing_text, job_title_text, must_have, prompt_template, delay=10):
    """Analyse each resume separately, pausing between requests.

    Args:
        resumes: Iterable of resume texts; each is analysed with
            ``generate_analysis_new``.
        job_listing_text: Text of the job listing.
        job_title_text: Job title.
        must_have: Must-have requirements.
        prompt_template: Template forwarded to ``generate_analysis_new``.
        delay: Seconds to sleep after every request (default 10).

    Returns:
        A list of the truthy analysis results, in input order (not sorted).
        Empty when ``resumes`` is empty or every response was falsy.

    Side effects:
        Prints the best-scoring result (``None`` when there are no results)
        and the full result list to stdout.
    """
    all_results = []
    for resume_text in resumes:
        structured_response = generate_analysis_new(
            resume_text, job_listing_text, job_title_text, must_have, prompt_template
        )
        if structured_response:
            all_results.append(structured_response)
        # Adding delay to avoid the 6000 tokens per minute limit
        time.sleep(delay)

    # Pick the highest-scoring result for logging only. ``default=None``
    # keeps an empty result list from raising ValueError.
    best_match = max(
        all_results,
        key=lambda x: x.get("overall_match_score", 0),
        default=None,
    )
    print('best_match', best_match)
    print('all_results', all_results)
    return all_results
def extract(content):
    """Parse the JSON object embedded in an LLM reply.

    The first triple-backtick fenced block is used. The opening fence may
    carry a language tag (for example "```json"). When the reply contains no
    fenced block at all, the whole reply is parsed as JSON instead.

    Args:
        content: Reply text.

    Returns:
        A new dictionary with the key/value pairs of the parsed JSON object.

    Raises:
        json.JSONDecodeError: If the selected text is not valid JSON.

    Side effects:
        Prints every ``key: value`` pair to stdout.
    """
    # Opening fence, optional language tag and trailing blanks, then the
    # payload up to the next line that starts a closing fence.
    json_pattern = r'```[\w-]*[ \t\r]*\n(.*?)\n```'
    fenced = re.search(json_pattern, content, re.DOTALL)
    # No fence: fall back to the raw reply rather than failing with an
    # AttributeError on ``None.group``.
    json_string = fenced.group(1) if fenced else content.strip()

    # Load the extracted JSON string into a dictionary
    data = json.loads(json_string)
    # Print the extracted variables and their values
    for key, value in data.items():
        print(f"{key}: {value}")
    return dict(data)
def extract_mist(json_string):
    """Decode a bare JSON object string into a fresh dictionary.

    Args:
        json_string: JSON text whose top-level value is an object.

    Returns:
        A new dictionary holding the decoded key/value pairs.

    Side effects:
        Prints every ``key: value`` pair to stdout.
    """
    parsed = json.loads(json_string)
    for field, field_value in parsed.items():
        # Echo each decoded entry
        print(f"{field}: {field_value}")
    return dict(parsed.items())
def extract_sel(content):
    """Parse a reply made of bolded candidate names each followed by JSON.

    Args:
        content: Reply text in which every candidate section starts with a
            ``**name**`` marker and is followed by a JSON document.

    Returns:
        A list with one decoded JSON value per candidate section, in order
        of appearance. If any section fails to decode, an error line is
        printed and an empty list is returned instead.
    """
    # With one capture group, re.split yields
    # [preamble, name, body, name, body, ...]; the bodies sit at 2, 4, 6, ...
    pieces = re.split(r'\*\*(.*?)\*\*', content)
    try:
        return [json.loads(body.strip()) for body in pieces[2::2]]
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return []
def generate_adv(job_listing_text, job_title_text, prompt_template):
    """Send a job-listing prompt to the agent and return the raw reply text.

    Args:
        job_listing_text: Text of the job listing.
        job_title_text: Job title.
        prompt_template: ``str.format`` template, formatted with the keyword
            arguments ``job_listing`` and ``job_title_text``.

    Returns:
        The ``content`` attribute of the agent's reply.

    Side effects:
        Prints that content to stdout.
    """
    llm = get_agent_groq()
    filled_prompt = prompt_template.format(
        job_listing=job_listing_text,
        job_title_text=job_title_text,
    )
    generated = llm.invoke(filled_prompt).content
    print(generated)
    return generated