from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from fuzzywuzzy import fuzz
import gradio as gr
import fitz # PyMuPDF for PDF extraction
# Load the fine-tuned SentenceTransformer model for embeddings
# (read from a local directory named 'fine_tuned_job_resume_similarity_model')
model = SentenceTransformer('fine_tuned_job_resume_similarity_model')
# Load Hugging Face NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
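# With aggregation_strategy="simple", the pipeline merges sub-word tokens and returns
# one dict per detected entity. Illustrative output shape (scores are made up):
#   ner_pipeline("Jane worked at Acme Corp in Berlin")
#   -> [{'entity_group': 'PER', 'score': 0.99, 'word': 'Jane', 'start': 0, 'end': 4},
#       {'entity_group': 'ORG', 'score': 0.98, 'word': 'Acme Corp', 'start': 15, 'end': 24},
#       {'entity_group': 'LOC', 'score': 0.99, 'word': 'Berlin', 'start': 28, 'end': 34}]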
# Generalized keywords across multiple industries
TARGET_KEYWORDS = [
"skill", "experience", "education", "certification", "project", "management",
"sales", "marketing", "customer service", "financial", "analysis", "quality",
"engineering", "healthcare", "law", "administration", "communication",
"problem-solving", "teamwork", "leadership", "technical", "planning", "operations"
]
# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    # pdf_file is expected to be a file path (what Gradio's File component
    # passes to the handler by default in recent versions)
    text = ""
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text("text")
    return text
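# Illustrative usage (the path below is a placeholder, not part of this app):
#   cv_text = extract_text_from_pdf("sample_resume.pdf")
#   print(cv_text[:200])  # first 200 characters of the extracted text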
# Define function to dynamically extract entities into generalized categories
def extract_entities(text):
    entities = {"qualifications": [], "responsibilities": [], "other": []}
    ner_results = ner_pipeline(text)
    for entity in ner_results:
        word = entity['word'].strip()
        # Dynamically assign entities to generalized categories based on context
        if any(keyword in word.lower() for keyword in ["skill", "degree", "education", "certification", "qualification"]):
            entities["qualifications"].append(word)
        elif any(keyword in word.lower() for keyword in ["experience", "responsibility", "role", "project"]):
            entities["responsibilities"].append(word)
        else:
            entities["other"].append(word)
    return entities
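# Illustrative result shape (made-up input):
#   extract_entities("Certification in Project Management from Acme University")
#   -> {'qualifications': [...], 'responsibilities': [...], 'other': [...]}
# Note that the CoNLL-03 model only tags persons, organizations, locations, and
# miscellaneous entities, so most detected words land in the "other" bucket unless
# the entity text itself happens to contain one of the category keywords above.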
# Function to compute fuzzy matching score for keywords
def fuzzy_match_keywords(cv_text, job_text, keywords):
    # Note: keywords are currently matched against the CV text only; job_text is unused here
    match_score = 0
    for keyword in keywords:
        score = fuzz.partial_ratio(cv_text.lower(), keyword.lower())
        match_score += score if score > 60 else 0  # Count only high-confidence matches
    return match_score / len(keywords) if keywords else 0
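# Illustrative behavior of fuzz.partial_ratio: it scores the best-matching
# substring, so a keyword that appears verbatim in the CV text scores 100.
#   fuzz.partial_ratio("five years of project management experience", "management")  # -> 100
#   fuzz.partial_ratio("background in data analysis", "marketing")                   # -> much lower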
def match_cv_to_job(pdf_file, job_description):
    # Extract text from the uploaded PDF file
    cv_text = extract_text_from_pdf(pdf_file)
    debug_info = "Debug Info:\n"

    # Extract entities from the CV and the job description
    cv_entities = extract_entities(cv_text)
    job_entities = extract_entities(job_description)

    # Calculate the NER-based entity match score: for each category, the fraction
    # of the job description's entities that also appear in the CV
    match_score = 0
    for key in cv_entities:
        if key in job_entities:
            matched = set(cv_entities[key]) & set(job_entities[key])
            match_score += len(matched) / len(set(job_entities[key])) if job_entities[key] else 0

    # Normalize the NER match score by the number of categories
    ner_match_score = (match_score / len(cv_entities)) * 100
    debug_info += f"NER Match Score: {ner_match_score:.2f}%\n"

    # Compute the fuzzy matching score for the generalized keywords
    fuzzy_keyword_score = fuzzy_match_keywords(cv_text, job_description, TARGET_KEYWORDS)
    debug_info += f"Fuzzy Keyword Score: {fuzzy_keyword_score:.2f}\n"

    # Calculate the overall semantic similarity score using sentence embeddings
    cv_embedding = model.encode(cv_text, convert_to_tensor=True)
    job_embedding = model.encode(job_description, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(cv_embedding, job_embedding).item()
    debug_info += f"Embedding Similarity Score: {similarity_score:.2f}\n"

    # Weighted combination of the three signals (weights sum to 1.0), balancing
    # entity and keyword matching across industries
    combined_score = (
        similarity_score * 0.5 +           # Embedding similarity
        (ner_match_score / 100) * 0.3 +    # NER-based entity match
        (fuzzy_keyword_score / 100) * 0.2  # Fuzzy keyword matching
    )
    match_percentage = combined_score * 100
    debug_info += f"Overall Match Percentage: {match_percentage:.2f}%\n"
    return {"Match Percentage": f"{match_percentage:.2f}%"}, debug_info
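# Worked example of the weighting, assuming hypothetical component scores:
# embedding similarity 0.80, NER match 50.00%, fuzzy keyword score 70.00
#   combined = 0.80 * 0.5 + (50.00 / 100) * 0.3 + (70.00 / 100) * 0.2
#            = 0.40 + 0.15 + 0.14 = 0.69  -> {"Match Percentage": "69.00%"}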
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# CV and Job Description Matcher for All Industries with NER and Fuzzy Matching")
    pdf_input = gr.File(label="Upload CV (PDF format)")
    job_description = gr.Textbox(label="Job Description", placeholder="Enter the job description text here", lines=10)
    match_button = gr.Button("Calculate Match Percentage")
    output = gr.JSON(label="Match Result")
    debug_output = gr.Textbox(label="Debug Info", lines=10)
    match_button.click(fn=match_cv_to_job, inputs=[pdf_input, job_description], outputs=[output, debug_output])

demo.launch()