# Hugging Face Spaces app: CV / job-description matching demo (NER + fuzzy + embeddings).
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from fuzzywuzzy import fuzz
import gradio as gr
import fitz  # PyMuPDF legacy alias for PDF extraction; `pymupdf` below is the same package
import pymupdf

# Load the SentenceTransformer model used for CV <-> job-description embeddings.
model = SentenceTransformer('fine_tuned_job_resume_similarity_model')

# Load Hugging Face NER model and tokenizer; "simple" aggregation merges
# word-piece tokens back into whole words before they reach extract_entities().
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
# Generalized keywords across multiple industries, used by fuzzy_match_keywords()
# to gauge topical overlap between a CV and a job description.
TARGET_KEYWORDS = [
    "skill", "experience", "education", "certification", "project", "management",
    "sales", "marketing", "customer service", "financial", "analysis", "quality",
    "engineering", "healthcare", "law", "administration", "communication",
    "problem-solving", "teamwork", "leadership", "technical", "planning", "operations",
]
# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of *pdf_file*, concatenated in order.

    pdf_file: a path or file-like object accepted by ``pymupdf.open``.
    """
    with pymupdf.open(pdf_file) as doc:
        # "".join avoids the quadratic cost of repeated `text += ...` concatenation.
        return "".join(page.get_text("text") for page in doc)
# Define function to dynamically extract entities into generalized categories
def extract_entities(text):
    """Bucket NER hits from *text* into qualification / responsibility / other lists.

    Returns a dict with keys "qualifications", "responsibilities", "other";
    each value is a list of entity words (duplicates preserved, input order kept).
    """
    qualification_cues = ("skill", "degree", "education", "certification", "qualification")
    responsibility_cues = ("experience", "responsibility", "role", "project")
    entities = {"qualifications": [], "responsibilities": [], "other": []}
    for entity in ner_pipeline(text):
        word = entity['word'].strip()
        lowered = word.lower()  # hoisted: was recomputed for each branch condition
        if any(cue in lowered for cue in qualification_cues):
            entities["qualifications"].append(word)
        elif any(cue in lowered for cue in responsibility_cues):
            entities["responsibilities"].append(word)
        else:
            entities["other"].append(word)
    return entities
# Function to compute fuzzy matching score for keywords
def fuzzy_match_keywords(cv_text, job_text, keywords):
    """Average fuzzy-match score (0-100) of *keywords* against *cv_text*.

    Only high-confidence hits (``fuzz.partial_ratio`` > 60) contribute; weaker
    matches count as 0 so noise does not inflate the average.

    NOTE(review): *job_text* is accepted for call-site compatibility but is
    currently unused — the score is computed against the CV text only.
    """
    if not keywords:
        return 0
    match_score = 0
    for keyword in keywords:
        score = fuzz.partial_ratio(cv_text.lower(), keyword.lower())
        if score > 60:  # Consider only high-confidence matches
            match_score += score
    return match_score / len(keywords)
def _ner_entity_match_score(cv_entities, job_entities):
    """Percent (0-100) of job entities also found in the CV, averaged over categories."""
    match_score = 0
    for key in cv_entities:
        if key in job_entities and job_entities[key]:
            matched = set(cv_entities[key]) & set(job_entities[key])
            match_score += len(matched) / len(set(job_entities[key]))
    # Normalize NER match score by number of categories.
    return (match_score / len(cv_entities)) * 100


def match_cv_to_job(pdf_file, job_description):
    """Score how well a CV (PDF) matches *job_description*.

    Returns a 2-tuple: ({"Match Percentage": "NN.NN%"}, debug_info string).
    On any failure the first element is {"Match Percentage": "Error"} and the
    debug string carries the exception text (UI boundary — never raises).
    """
    try:
        # Extract text from PDF file
        cv_text = extract_text_from_pdf(pdf_file)
        debug_info = "Debug Info:\n"

        # Extract entities from CV and job description, then score their overlap.
        cv_entities = extract_entities(cv_text)
        job_entities = extract_entities(job_description)
        ner_match_score = _ner_entity_match_score(cv_entities, job_entities)
        debug_info += f"NER Match Score: {ner_match_score:.2f}%\n"

        # Compute fuzzy matching score for generalized keywords
        fuzzy_keyword_score = fuzzy_match_keywords(cv_text, job_description, TARGET_KEYWORDS)
        debug_info += f"Fuzzy Keyword Score: {fuzzy_keyword_score:.2f}\n"

        # Calculate overall similarity score using embeddings
        cv_embedding = model.encode(cv_text, convert_to_tensor=True)
        job_embedding = model.encode(job_description, convert_to_tensor=True)
        similarity_score = util.pytorch_cos_sim(cv_embedding, job_embedding).item()
        debug_info += f"Embedding Similarity Score: {similarity_score:.2f}\n"

        # Adjust weights to balance entity and keyword matching for various industries
        combined_score = (
            similarity_score * 0.5 +          # Embedding similarity
            (ner_match_score / 100) * 0.3 +   # NER-based entity match
            (fuzzy_keyword_score / 100) * 0.2 # Fuzzy matching for keywords
        )
        match_percentage = combined_score * 100
        debug_info += f"Overall Match Percentage: {match_percentage:.2f}%\n"
        return {"Match Percentage": f"{match_percentage:.2f}%"}, debug_info
    except Exception as e:
        # Capture and display the exception in debug output
        debug_info = f"An error occurred: {str(e)}"
        return {"Match Percentage": "Error"}, debug_info
# Gradio interface: upload a CV PDF, paste a job description, get a match score.
with gr.Blocks() as app:
    gr.Markdown("# CV and Job Description Matcher for All Industries with NER and Fuzzy Matching")
    # Component creation order matters for layout — kept as: file, textbox,
    # button, JSON result, debug textbox.
    cv_upload = gr.File(label="Upload CV (PDF format)")
    jd_box = gr.Textbox(label="Job Description", placeholder="Enter the job description text here", lines=10)
    calc_btn = gr.Button("Calculate Match Percentage")
    result_json = gr.JSON(label="Match Result")
    debug_box = gr.Textbox(label="Debug Info", lines=10)
    calc_btn.click(fn=match_cv_to_job, inputs=[cv_upload, jd_box], outputs=[result_json, debug_box])
app.launch()