import streamlit as st
import re
import time
import os
from datetime import date

import docx2txt
import nltk
import torch
from nltk.corpus import stopwords
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Set page title and configuration
st.set_page_config(
    page_title="Resume-Job Fit Analyzer",
    page_icon="📊",
    layout="wide"
)

# Used for open-ended experience ranges such as "2020 - present"
CURRENT_YEAR = date.today().year

# Download NLTK resources if needed
@st.cache_resource
def download_nltk_resources():
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('punkt')
        nltk.download('stopwords')
    return stopwords.words('english')

stop_words = download_nltk_resources()

# Load models
@st.cache_resource
def load_models():
    """Load and cache the NLP models"""
    models = {}

    # Use BART for resume parsing
    models['parser'] = pipeline(
        "text2text-generation",
        model="facebook/bart-base",  # This would be the fine-tuned model in production
        device=0 if torch.cuda.is_available() else -1
    )

    # Use Qwen for evaluation
    models['evaluator'] = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    return models
# Read resume file
def read_resume_file(uploaded_file):
    """Extract text from an uploaded resume file"""
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".txt":
        # Plain text file
        return uploaded_file.read().decode('utf-8', errors='ignore')

    elif file_extension == ".docx":
        # Modern Word document
        try:
            return docx2txt.process(uploaded_file)
        except Exception as e:
            st.error(f"Error reading DOCX file: {str(e)}")
            return None

    elif file_extension == ".doc":
        # Legacy Word document - conversion is best-effort
        try:
            st.warning("Note: .doc files might not convert perfectly. "
                       "For best results, upload .docx or .txt files.")

            # Save the uploaded file temporarily
            with open("temp_file.doc", "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Use an external converter - this is a placeholder.
            # In a real implementation, you might want to use antiword,
            # textract or similar. Here we try docx2txt as a fallback,
            # but it might not handle legacy .doc files well.
            try:
                text = docx2txt.process("temp_file.doc")
            except Exception:
                # If that fails, try a very basic approach
                with open("temp_file.doc", "rb") as f:
                    content = f.read()
                text = content.decode('utf-8', errors='ignore')
                # Try to extract readable text by removing binary parts
                text = re.sub(r'[^\x20-\x7E\n\r\t]', '', text)

            # Clean up the temporary file
            if os.path.exists("temp_file.doc"):
                os.remove("temp_file.doc")

            return text
        except Exception as e:
            st.error(f"Error reading DOC file: {str(e)}")
            return None

    else:
        st.error(f"Unsupported file format: {file_extension}")
        return None

# Extract skills from text
def extract_skills(text, skill_keywords):
    """Extract skills from text based on a predefined list of skills"""
    found_skills = []
    text_lower = text.lower()

    for skill in skill_keywords:
        # Whole-word, case-insensitive matching. Lookarounds are used instead
        # of \b so that skills ending in non-word characters (e.g. "C++")
        # still match.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, text_lower):
            found_skills.append(skill)

    return list(set(found_skills))
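# Illustrative example of extract_skills (hypothetical input, not from the
# app's UI):
#   extract_skills("Built REST services in Python and Go", ["Python", "Go", "Java"])
# returns ["Python", "Go"] in some order (duplicates are removed via a set, so
# ordering is not guaranteed). Matching is case-insensitive and whole-word:
# "JavaScript" in a resume does not match the keyword "Java".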
# Common skill keywords shared by the resume and job-description parsers
# (this would be a more extensive list in production)
TECH_SKILLS = [
    "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
    "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
    "TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
    "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
    "REST API", "GraphQL", "Microservices", "Serverless"
]

SOFT_SKILLS = [
    "Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
    "Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
]

# Parse resume
def parse_resume(resume_text, models):
    """Extract structured information from resume text"""
    # In production, this would use the fine-tuned BART model.
    # For now, we implement a simple rule-based parser.

    # Clean the text
    clean_text = re.sub(r'\s+', ' ', resume_text).strip()

    # Extract skills
    found_tech_skills = extract_skills(clean_text, TECH_SKILLS)
    found_soft_skills = extract_skills(clean_text, SOFT_SKILLS)

    # Extract the experience section using regex patterns (simplified)
    experience_pattern = r'(?:Experience|EXPERIENCE|Work Experience|WORK EXPERIENCE).*?(?:Education|EDUCATION|Skills|SKILLS|$)'
    experience_match = re.search(experience_pattern, clean_text, re.DOTALL)
    experience_text = experience_match.group(0) if experience_match else ""

    # Extract the education section using regex patterns (simplified)
    education_pattern = r'(?:Education|EDUCATION).*?(?:Skills|SKILLS|Experience|EXPERIENCE|$)'
    education_match = re.search(education_pattern, clean_text, re.DOTALL)
    education_text = education_match.group(0) if education_match else ""

    # Estimate years of experience from date ranges (simplified).
    # Closed ranges ("2018 - 2021") are handled exclusively by the second
    # pattern so that each range is counted only once.
    years_exp = 0
    year_patterns = [
        r'(\d{4})\s*-\s*(?:present|current|now)',  # open-ended ranges
        r'(\d{4})\s*-\s*(\d{4})'                   # closed ranges
    ]

    for pattern in year_patterns:
        matches = re.findall(pattern, clean_text, re.IGNORECASE)
        for match in matches:
            if isinstance(match, tuple):
                start_year, end_year = int(match[0]), int(match[1])
            else:
                start_year, end_year = int(match), CURRENT_YEAR
            years_exp += (end_year - start_year)

    # Cap at a reasonable number of years
    years_exp = min(years_exp, 30)

    # Create structured data
    structured_data = {
        "skills": {
            "technical": found_tech_skills,
            "soft": found_soft_skills
        },
        "experience": {
            "years": years_exp,
            "summary": (experience_text[:300] + "...") if len(experience_text) > 300 else experience_text
        },
        "education": (education_text[:300] + "...") if len(education_text) > 300 else education_text
    }

    return structured_data

# Parse job description
def parse_job_description(job_text):
    """Extract key requirements from a job description"""
    # Clean the text
    clean_text = re.sub(r'\s+', ' ', job_text).strip()

    # Extract skills (same keyword lists as the resume parser)
    required_tech_skills = extract_skills(clean_text, TECH_SKILLS)
    required_soft_skills = extract_skills(clean_text, SOFT_SKILLS)

    # Extract the years-of-experience requirement (simplified)
    exp_patterns = [
        r'(\d+)\+?\s*(?:years|yrs|yr)(?:\s*of)?\s*(?:experience|exp)',
        r'(?:experience|exp)(?:\s*of)?\s*(\d+)\+?\s*(?:years|yrs|yr)'
    ]

    required_years = 0
    for pattern in exp_patterns:
        matches = re.findall(pattern, clean_text, re.IGNORECASE)
        if matches:
            # Take the highest number of years mentioned
            required_years = max([int(y) for y in matches if y.isdigit()] + [required_years])

    # Use the first non-empty line of the raw text as the job title.
    # (The cleaned text has its newlines collapsed, so it cannot be used here.)
    first_line = next((line.strip() for line in job_text.splitlines() if line.strip()), "")
    job_title = first_line if first_line else "Not specified"

    # Create structured data
    structured_data = {
        "title": job_title,
        "requirements": {
            "technical_skills": required_tech_skills,
            "soft_skills": required_soft_skills,
            "years_experience": required_years
        },
        "full_text": job_text
    }

    return structured_data
# Calculate match score
def calculate_match_score(resume_data, job_data):
    """Calculate how well the resume matches the job description"""
    scores = {}

    # Calculate skill match percentages
    required_tech_skills = set(job_data["requirements"]["technical_skills"])
    candidate_tech_skills = set(resume_data["skills"]["technical"])
    required_soft_skills = set(job_data["requirements"]["soft_skills"])
    candidate_soft_skills = set(resume_data["skills"]["soft"])

    if required_tech_skills:
        tech_match = len(candidate_tech_skills.intersection(required_tech_skills)) / len(required_tech_skills)
        scores["technical_skills"] = {
            "score": int(tech_match * 100),
            "matched": list(candidate_tech_skills.intersection(required_tech_skills)),
            "missing": list(required_tech_skills - candidate_tech_skills)
        }
    else:
        scores["technical_skills"] = {"score": 0, "matched": [], "missing": []}

    if required_soft_skills:
        soft_match = len(candidate_soft_skills.intersection(required_soft_skills)) / len(required_soft_skills)
        scores["soft_skills"] = {
            "score": int(soft_match * 100),
            "matched": list(candidate_soft_skills.intersection(required_soft_skills)),
            "missing": list(required_soft_skills - candidate_soft_skills)
        }
    else:
        scores["soft_skills"] = {"score": 0, "matched": [], "missing": []}

    # Experience match
    required_years = job_data["requirements"]["years_experience"]
    candidate_years = resume_data["experience"]["years"]

    if required_years > 0:
        if candidate_years >= required_years:
            exp_score = 100
        else:
            exp_score = int((candidate_years / required_years) * 100)
        scores["experience"] = {
            "score": exp_score,
            "candidate_years": candidate_years,
            "required_years": required_years
        }
    else:
        scores["experience"] = {
            "score": 100 if candidate_years > 0 else 50,
            "candidate_years": candidate_years,
            "required_years": "Not specified"
        }

    # Calculate overall score (weighted)
    tech_weight = 0.6
    soft_weight = 0.2
    exp_weight = 0.2

    overall_score = (
        scores["technical_skills"]["score"] * tech_weight +
        scores["soft_skills"]["score"] * soft_weight +
        scores["experience"]["score"] * exp_weight
    )

    scores["overall"] = int(overall_score)

    return scores
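# Worked example of the weighting above (hypothetical scores): a candidate with
# a 60% technical match, 100% soft-skills match, and 100% experience match gets
#   0.6 * 60 + 0.2 * 100 + 0.2 * 100 = 76
# which clears the FIT threshold of 70 used in generate_assessment() below.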
# Generate expert assessment using Qwen
def generate_assessment(resume_data, job_data, match_scores, models):
    """Generate an expert assessment using the Qwen model"""
    # Prepare context
    job_title = job_data["title"]
    matched_skills = match_scores["technical_skills"]["matched"]
    missing_skills = match_scores["technical_skills"]["missing"]
    experience_match = match_scores["experience"]
    overall_score = match_scores["overall"]

    # Determine fit classification
    fit_status = "FIT" if overall_score >= 70 else "NOT FIT"

    # Build a ChatML-style prompt (the chat format Qwen instruct models expect)
    prompt = f"""<|im_start|>system
You are an expert resume evaluator. Analyze how well a candidate fits a job posting and provide professional feedback.
<|im_end|>
<|im_start|>user
Evaluate this candidate for a {job_title} position.

Overall match score: {overall_score}%
Technical skills match: {match_scores['technical_skills']['score']}%
Soft skills match: {match_scores['soft_skills']['score']}%
Experience match: {experience_match['score']}%

Candidate has: {experience_match['candidate_years']} years of experience
Position requires: {experience_match['required_years']} years of experience

Matched technical skills: {", ".join(matched_skills) if matched_skills else "None"}
Missing technical skills: {", ".join(missing_skills) if missing_skills else "None"}

Create a professional assessment of this candidate. First state whether they are a FIT or NOT FIT for the position, then explain why with specific strengths and development areas.
<|im_end|>
<|im_start|>assistant
"""

    try:
        # Generate the assessment using Qwen
        tokenizer = models['evaluator_tokenizer']
        qwen_model = models['evaluator']

        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = qwen_model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

        # Decode only the newly generated tokens (everything after the prompt)
        # so the prompt text is not echoed back in the assessment
        assessment = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        # Clean up any chat markers the model may have emitted
        assessment = re.sub(r'<\|im_(start|end)\|>', '', assessment).strip()

        # If no usable assessment was generated, fall back to a template
        if not assessment or len(assessment) < 50:
            assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)
    except Exception as e:
        st.error(f"Error generating assessment: {str(e)}")
        assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)

    return assessment, fit_status

# Generate fallback assessment
def generate_fallback_assessment(resume_data, job_data, match_scores, fit_status):
    """Generate a template-based assessment if the model fails"""
    job_title = job_data["title"]
    matched_skills = match_scores["technical_skills"]["matched"]
    missing_skills = match_scores["technical_skills"]["missing"]
    overall_score = match_scores["overall"]

    if fit_status == "FIT":
        assessment = f"""FIT: This candidate demonstrates a strong alignment with the {job_title} position, achieving an overall match score of {overall_score}%. Their proficiency in {', '.join(matched_skills) if matched_skills else 'relevant skills'} positions them well to contribute effectively from the start. The candidate's experience level is suitable for the role's requirements. To maximize their success, they could consider developing expertise in {', '.join(missing_skills) if missing_skills else 'additional specialized areas relevant to this role'}."""
    else:
        assessment = f"""NOT FIT: This candidate currently shows limited alignment with the {job_title} position, with an overall match score of {overall_score}%. While they demonstrate some relevant capabilities in {', '.join(matched_skills) if matched_skills else 'a few areas'}, they would need to develop expertise in critical areas such as {', '.join(missing_skills) if missing_skills else 'key technical requirements for this position'}. The candidate may become more competitive for this role by focusing on these skill gaps and gaining more relevant experience."""

    return assessment
# Create the main header and interface
st.title("Resume-Job Fit Analyzer")
st.markdown("### Evaluate how well a resume matches a job description")

# Resume upload
st.subheader("Resume")
uploaded_file = st.file_uploader("Upload Resume (.doc, .docx, .txt)", type=["doc", "docx", "txt"])

# Job description input
st.subheader("Job Description")
job_description = st.text_area("Paste job description here", height=200,
                               placeholder="Paste the job description here...")

# Display resume text if a file is uploaded
resume_text = None
if uploaded_file is not None:
    resume_text = read_resume_file(uploaded_file)
    if resume_text:
        with st.expander("View Resume Text"):
            st.text(resume_text[:1000] + ("..." if len(resume_text) > 1000 else ""))

# Analysis button
analyze_button = st.button("Analyze Match", type="primary")

# Main analysis logic
if analyze_button:
    if not resume_text or not job_description:
        st.error("Please upload a resume file and provide a job description.")
    else:
        with st.spinner("Analyzing resume and job match..."):
            # Record start time
            start_time = time.time()

            # Load models (cached, so they only load once)
            models = load_models()

            # Parse resume and job description
            resume_data = parse_resume(resume_text, models)
            job_data = parse_job_description(job_description)

            # Calculate match score
            match_scores = calculate_match_score(resume_data, job_data)

            # Generate assessment
            assessment, fit_status = generate_assessment(resume_data, job_data, match_scores, models)

            # Calculate execution time
            execution_time = time.time() - start_time

        # Display results
        st.success(f"Analysis complete in {execution_time:.2f} seconds")

        # Display fit status prominently
        st.markdown(f"## Overall Result: {fit_status}")

        # Display match score
        st.subheader("Match Score")
        score_col1, score_col2, score_col3 = st.columns(3)
        with score_col1:
            st.metric("Overall Match", f"{match_scores['overall']}%")
        with score_col2:
            st.metric("Technical Skills", f"{match_scores['technical_skills']['score']}%")
        with score_col3:
            st.metric("Experience Match", f"{match_scores['experience']['score']}%")

        # Show skills breakdown
        st.subheader("Skills Breakdown")
        skill_col1, skill_col2 = st.columns(2)
        with skill_col1:
            st.markdown("##### Matched Skills")
            if match_scores["technical_skills"]["matched"]:
                for skill in match_scores["technical_skills"]["matched"]:
                    st.markdown(f"✅ {skill}")
            else:
                st.markdown("No matched skills found")
        with skill_col2:
            st.markdown("##### Missing Skills")
            if match_scores["technical_skills"]["missing"]:
                for skill in match_scores["technical_skills"]["missing"]:
                    st.markdown(f"❌ {skill}")
            else:
                st.markdown("No missing skills detected")

        # Show experience comparison
        st.subheader("Experience")
        exp_col1, exp_col2 = st.columns(2)
        with exp_col1:
            st.markdown(f"**Required**: {job_data['requirements']['years_experience']} years")
        with exp_col2:
            st.markdown(f"**Candidate has**: {resume_data['experience']['years']} years")

        # Display detailed assessment
        st.subheader("Expert Assessment")
        st.markdown(assessment)

        # Show parsed data (expandable)
        with st.expander("View Parsed Data"):
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Resume Data")
                st.json(resume_data)
            with col2:
                st.subheader("Job Requirements")
                st.json(job_data)
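# To launch the app locally (assuming this script is saved as app.py):
#   streamlit run app.py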