# cv_job/app.py
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from fuzzywuzzy import fuzz
import gradio as gr
import pymupdf  # PyMuPDF for PDF text extraction
# Load the SentenceTransformer model for embeddings
model = SentenceTransformer('fine_tuned_job_resume_similarity_model')
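# Note: the path above refers to a local fine-tuned checkpoint. If it is not
# available, a stock checkpoint such as 'all-MiniLM-L6-v2' could be swapped in
# for testing (assumption: any SentenceTransformer model works here, at the
# cost of matching quality).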
# Load Hugging Face NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
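# With aggregation_strategy="simple", the pipeline returns one dict per merged
# entity span, e.g. (illustrative):
#   ner_pipeline("John works at Google") ->
#   [{'entity_group': 'PER', 'word': 'John', 'score': ..., 'start': 0, 'end': 4},
#    {'entity_group': 'ORG', 'word': 'Google', 'score': ..., 'start': 14, 'end': 20}]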
# Generalized keywords across multiple industries
TARGET_KEYWORDS = [
"skill", "experience", "education", "certification", "project", "management",
"sales", "marketing", "customer service", "financial", "analysis", "quality",
"engineering", "healthcare", "law", "administration", "communication",
"problem-solving", "teamwork", "leadership", "technical", "planning", "operations"
]
# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    # Gradio's File component may pass either a filepath string or a file-like
    # object with a .name attribute, depending on the Gradio version.
    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    text = ""
    with pymupdf.open(path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text
# Bucket NER entity spans into generalized categories based on keyword cues
def extract_entities(text):
entities = {"qualifications": [], "responsibilities": [], "other": []}
ner_results = ner_pipeline(text)
for entity in ner_results:
word = entity['word'].strip()
if any(keyword in word.lower() for keyword in ["skill", "degree", "education", "certification", "qualification"]):
entities["qualifications"].append(word)
elif any(keyword in word.lower() for keyword in ["experience", "responsibility", "role", "project"]):
entities["responsibilities"].append(word)
else:
entities["other"].append(word)
return entities
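# Illustrative output shape (the CoNLL-trained NER model mostly emits person/
# organization/location/misc spans, so unless a span's text contains one of the
# cue words above, it lands in "other"):
#   {"qualifications": [], "responsibilities": [], "other": ["John", "Google"]}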
# Function to compute fuzzy matching score for keywords
def fuzzy_match_keywords(cv_text, job_text, keywords):
    # Note: keywords are matched against the CV text only; job_text is accepted
    # by the interface but not currently factored into the score.
    match_score = 0
for keyword in keywords:
score = fuzz.partial_ratio(cv_text.lower(), keyword.lower())
match_score += score if score > 60 else 0 # Consider only high-confidence matches
return match_score / len(keywords) if keywords else 0
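# fuzz.partial_ratio scores the best-matching substring, so an exact keyword hit
# inside the CV text scores 100, e.g. (illustrative):
#   fuzz.partial_ratio("five years of project management experience", "management")  # -> 100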
def match_cv_to_job(pdf_file, job_description):
try:
# Extract text from PDF file
cv_text = extract_text_from_pdf(pdf_file)
debug_info = "Debug Info:\n"
# Extract entities from CV and job description
cv_entities = extract_entities(cv_text)
job_entities = extract_entities(job_description)
# Calculate NER-based entity match score
match_score = 0
for key in cv_entities:
if key in job_entities:
matched = set(cv_entities[key]) & set(job_entities[key])
match_score += len(matched) / len(set(job_entities[key])) if job_entities[key] else 0
# Normalize NER match score by number of categories
ner_match_score = (match_score / len(cv_entities)) * 100
debug_info += f"NER Match Score: {ner_match_score:.2f}%\n"
# Compute fuzzy matching score for generalized keywords
fuzzy_keyword_score = fuzzy_match_keywords(cv_text, job_description, TARGET_KEYWORDS)
debug_info += f"Fuzzy Keyword Score: {fuzzy_keyword_score:.2f}\n"
# Calculate overall similarity score using embeddings
cv_embedding = model.encode(cv_text, convert_to_tensor=True)
job_embedding = model.encode(job_description, convert_to_tensor=True)
similarity_score = util.pytorch_cos_sim(cv_embedding, job_embedding).item()
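        # util.pytorch_cos_sim returns a 1x1 tensor; .item() extracts the scalar,
        # a cosine similarity roughly in [-1, 1] (typically 0-1 for this model).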
debug_info += f"Embedding Similarity Score: {similarity_score:.2f}\n"
# Adjust weights to balance entity and keyword matching for various industries
combined_score = (
similarity_score * 0.5 + # Embedding similarity
(ner_match_score / 100) * 0.3 + # NER-based entity match
(fuzzy_keyword_score / 100) * 0.2 # Fuzzy matching for keywords
)
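        # Worked example (illustrative numbers): embedding similarity 0.80,
        # NER match 50%, fuzzy keyword score 70 ->
        # 0.80*0.5 + 0.50*0.3 + 0.70*0.2 = 0.40 + 0.15 + 0.14 = 0.69, i.e. a 69% match.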
match_percentage = combined_score * 100
debug_info += f"Overall Match Percentage: {match_percentage:.2f}%\n"
return {"Match Percentage": f"{match_percentage:.2f}%"}, debug_info
except Exception as e:
# Capture and display the exception in debug output
debug_info = f"An error occurred: {str(e)}"
return {"Match Percentage": "Error"}, debug_info
# Gradio interface
with gr.Blocks() as demo:
gr.Markdown("# CV and Job Description Matcher for All Industries with NER and Fuzzy Matching")
pdf_input = gr.File(label="Upload CV (PDF format)")
job_description = gr.Textbox(label="Job Description", placeholder="Enter the job description text here", lines=10)
match_button = gr.Button("Calculate Match Percentage")
output = gr.JSON(label="Match Result")
debug_output = gr.Textbox(label="Debug Info", lines=10)
match_button.click(fn=match_cv_to_job, inputs=[pdf_input, job_description], outputs=[output, debug_output])
demo.launch()