from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from fuzzywuzzy import fuzz  # the maintained "thefuzz" fork exposes the same fuzz API
import gradio as gr
import fitz  # PyMuPDF for PDF extraction

# Load the SentenceTransformer model for embeddings
# (a locally fine-tuned model directory expected alongside this script)
model = SentenceTransformer('fine_tuned_job_resume_similarity_model')

# Load Hugging Face NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")

# Generalized keywords across multiple industries
TARGET_KEYWORDS = [
    "skill", "experience", "education", "certification", "project", "management",
    "sales", "marketing", "customer service", "financial", "analysis", "quality",
    "engineering", "healthcare", "law", "administration", "communication",
    "problem-solving", "teamwork", "leadership", "technical", "planning", "operations"
]

# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    # Gradio may pass a filepath string or a tempfile wrapper with a .name
    # attribute depending on version; fitz.open() expects the path itself
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text

# Define function to dynamically extract entities into generalized categories
def extract_entities(text):
    entities = {"qualifications": [], "responsibilities": [], "other": []}
    # BERT-based NER models cap out at 512 tokens, so run the pipeline over
    # fixed-size character chunks instead of the whole document at once
    chunk_size = 1000
    ner_results = []
    for i in range(0, len(text), chunk_size):
        ner_results.extend(ner_pipeline(text[i:i + chunk_size]))
    for entity in ner_results:
        word = entity['word'].strip()
        # Dynamically assign entities to generalized categories based on context
        if any(keyword in word.lower() for keyword in ["skill", "degree", "education", "certification", "qualification"]):
            entities["qualifications"].append(word)
        elif any(keyword in word.lower() for keyword in ["experience", "responsibility", "role", "project"]):
            entities["responsibilities"].append(word)
        else:
            entities["other"].append(word)
    return entities
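# With aggregation_strategy="simple", ner_pipeline returns grouped entities,
# e.g. (illustrative shape, not output from a real run):
#   [{"entity_group": "ORG", "word": "Acme Corp", "score": 0.99, "start": 0, "end": 9}]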

# Function to compute fuzzy matching score for keywords
def fuzzy_match_keywords(cv_text, keywords):
    match_score = 0
    for keyword in keywords:
        # partial_ratio slides the keyword over the CV text and scores the
        # best-matching window on a 0-100 scale
        score = fuzz.partial_ratio(cv_text.lower(), keyword.lower())
        match_score += score if score > 60 else 0  # Consider only high-confidence matches
    return match_score / len(keywords) if keywords else 0
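# Worked example, assuming a CV containing the phrase "managed a sales team":
#   fuzz.partial_ratio("managed a sales team", "sales") == 100, so "sales"
#   adds 100 to match_score, while keywords scoring 60 or below add nothing.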

def match_cv_to_job(pdf_file, job_description):
    # Extract text from PDF file
    cv_text = extract_text_from_pdf(pdf_file)
    debug_info = "Debug Info:\n"
    
    # Extract entities from CV and job description
    cv_entities = extract_entities(cv_text)
    job_entities = extract_entities(job_description)
    
    # Calculate NER-based entity match score
    match_score = 0
    for key in cv_entities:
        if key in job_entities:
            matched = set(cv_entities[key]) & set(job_entities[key])
            match_score += len(matched) / len(set(job_entities[key])) if job_entities[key] else 0

    # Normalize NER match score by number of categories
    ner_match_score = (match_score / len(cv_entities)) * 100  
    debug_info += f"NER Match Score: {ner_match_score:.2f}%\n"
    
    # Compute fuzzy matching score for generalized keywords
    fuzzy_keyword_score = fuzzy_match_keywords(cv_text, TARGET_KEYWORDS)
    debug_info += f"Fuzzy Keyword Score: {fuzzy_keyword_score:.2f}\n"
    
    # Calculate overall similarity score using embeddings
    cv_embedding = model.encode(cv_text, convert_to_tensor=True)
    job_embedding = model.encode(job_description, convert_to_tensor=True)
    similarity_score = util.cos_sim(cv_embedding, job_embedding).item()  # pytorch_cos_sim is a legacy alias for cos_sim
    debug_info += f"Embedding Similarity Score: {similarity_score:.2f}\n"

    # Adjust weights to balance entity and keyword matching for various industries
    combined_score = (
        similarity_score * 0.5 +      # Embedding similarity
        (ner_match_score / 100) * 0.3 +  # NER-based entity match
        (fuzzy_keyword_score / 100) * 0.2  # Fuzzy matching for keywords
    )
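    # Worked example with illustrative numbers (not from a real run):
    # similarity 0.70, NER match 40%, fuzzy score 65
    # -> 0.70*0.5 + 0.40*0.3 + 0.65*0.2 = 0.60, reported as 60.00%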
    match_percentage = combined_score * 100
    debug_info += f"Overall Match Percentage: {match_percentage:.2f}%\n"
    
    return {"Match Percentage": f"{match_percentage:.2f}%"}, debug_info

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# CV and Job Description Matcher\nCombines embedding similarity, NER entity overlap, and fuzzy keyword matching.")
    
    pdf_input = gr.File(label="Upload CV (PDF format)", file_types=[".pdf"])
    job_description = gr.Textbox(label="Job Description", placeholder="Enter the job description text here", lines=10)
    
    match_button = gr.Button("Calculate Match Percentage")
    output = gr.JSON(label="Match Result")
    debug_output = gr.Textbox(label="Debug Info", lines=10)
    
    match_button.click(fn=match_cv_to_job, inputs=[pdf_input, job_description], outputs=[output, debug_output])

if __name__ == "__main__":
    demo.launch()
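# For a quick check without the UI, the scorer can be called directly.
# "sample_cv.pdf" is a hypothetical local file, not shipped with this app:
#
#   result, debug = match_cv_to_job("sample_cv.pdf", "Seeking a data analyst with SQL experience")
#   print(result["Match Percentage"])
#   print(debug)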