# Hugging Face Space page header (status when captured: Sleeping)
from sentence_transformers import SentenceTransformer, util | |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
import gradio as gr | |
# Sentence-embedding model used for the whole-document similarity score.
model = SentenceTransformer("msmarco-distilbert-base-v4")

# Token-classification (NER) checkpoint. The tokenizer and the model weights
# are loaded from the same Hugging Face Hub id so the two cannot drift apart.
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# The consumer of this pipeline reads the `entity_group` and `word` fields of
# each result, hence the aggregated ("simple") output format.
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def extract_entities(text):
    """Group NER pipeline hits into skills, experience and education buckets.

    Runs the module-level ``ner_pipeline`` over *text* and files each
    detected entity's ``word`` under the first bucket whose keyword occurs
    in the entity's ``entity_group`` label. Entities whose label matches no
    keyword are ignored.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with the keys ``"skills"``, ``"experience"`` and
        ``"education"``; each value is the list of matched entity strings in
        pipeline order (duplicates are kept).
    """
    # NOTE(review): a CoNLL-03 checkpoint presumably only emits
    # PER/ORG/LOC/MISC groups, in which case none of the keywords below can
    # match and every bucket stays empty -- confirm against the label set of
    # the model behind `ner_pipeline`.
    bucket_keywords = (
        ("skills", ("SKILL",)),
        ("experience", ("EXPERIENCE", "JOB")),
        ("education", ("DEGREE", "EDUCATION")),
    )
    buckets = {bucket: [] for bucket, _ in bucket_keywords}

    for hit in ner_pipeline(text):
        group = hit['entity_group']
        # Table order encodes priority: the first bucket that matches wins.
        for bucket, keywords in bucket_keywords:
            if any(keyword in group for keyword in keywords):
                buckets[bucket].append(hit['word'])
                break

    return buckets
def match_cv_to_job(cv_text, job_description):
    """Score how well a CV matches a job description.

    Two signals are blended:

    * the cosine similarity of the two texts' sentence embeddings
      (weight 0.7), and
    * the fraction of the job description's extracted entities that also
      occur in the CV, averaged over the entity categories (weight 0.3).

    Args:
        cv_text: Full text of the CV.
        job_description: Full text of the job description.

    Returns:
        A ``(result, debug_info)`` tuple. ``result`` is
        ``{"Match Percentage": "<xx.xx>%"}`` and ``debug_info`` is a
        multi-line string listing the intermediate scores. If either input
        is empty or whitespace-only the match is reported as ``0.00%``
        without running any model.
    """
    embedding_weight = 0.7
    ner_weight = 0.3
    debug_lines = ["Debug Info:"]

    # An empty text carries no information to compare. In particular, two
    # empty inputs encode identically (cosine similarity 1.0) and would
    # otherwise be reported as a 70% match.
    cv_is_blank = not (cv_text and cv_text.strip())
    job_is_blank = not (job_description and job_description.strip())
    if cv_is_blank or job_is_blank:
        debug_lines.append("Empty CV or job description; nothing to compare.")
        debug_lines.append("Overall Match Percentage: 0.00%")
        return {"Match Percentage": "0.00%"}, "\n".join(debug_lines) + "\n"

    # Entity overlap: per category, the share of the job's distinct entities
    # that the CV also mentions. Categories in which the job description has
    # no entities contribute 0.
    cv_entities = extract_entities(cv_text)
    job_entities = extract_entities(job_description)
    overlap_total = 0.0
    for category, cv_items in cv_entities.items():
        wanted = set(job_entities.get(category, ()))
        if wanted:
            overlap_total += len(wanted & set(cv_items)) / len(wanted)

    # Average over the categories actually returned instead of a literal 3,
    # and avoid dividing by zero if there are none.
    category_count = len(cv_entities)
    ner_fraction = overlap_total / category_count if category_count else 0.0
    ner_match_score = ner_fraction * 100
    debug_lines.append(f"NER Match Score: {ner_match_score:.2f}%")

    # Whole-document similarity from the sentence embeddings.
    cv_embedding = model.encode(cv_text, convert_to_tensor=True)
    job_embedding = model.encode(job_description, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(cv_embedding, job_embedding).item()
    # Cosine similarity lies in [-1, 1]; clamp it so unrelated texts cannot
    # yield a negative percentage and rounding cannot push it above 100%.
    similarity_score = max(0.0, min(1.0, similarity_score))

    combined_score = (
        similarity_score * embedding_weight
        + (ner_match_score / 100) * ner_weight
    )
    match_percentage = combined_score * 100
    debug_lines.append(f"Overall Match Percentage: {match_percentage:.2f}%")

    return (
        {"Match Percentage": f"{match_percentage:.2f}%"},
        "\n".join(debug_lines) + "\n",
    )
# Gradio UI: two text inputs and a button; the handler's two return values
# are shown as a JSON result and a plain-text debug log.
with gr.Blocks() as demo:
    gr.Markdown("# CV and Job Description Matcher with Embeddings and NER Matching")

    # Inputs
    cv_text = gr.Textbox(
        label="CV Text",
        placeholder="Enter the CV text here",
        lines=10,
    )
    job_description = gr.Textbox(
        label="Job Description",
        placeholder="Enter the entire job description text here",
        lines=10,
    )
    match_button = gr.Button("Calculate Match Percentage")

    # Outputs
    output = gr.JSON(label="Match Result")
    debug_output = gr.Textbox(label="Debug Info", lines=10)

    # List order matches match_cv_to_job's parameters and its returned tuple.
    handler_inputs = [cv_text, job_description]
    handler_outputs = [output, debug_output]
    match_button.click(
        fn=match_cv_to_job,
        inputs=handler_inputs,
        outputs=handler_outputs,
    )

demo.launch()