Spaces:
Sleeping
Sleeping
import streamlit as st | |
import google.generativeai as genai | |
from PIL import Image | |
import fitz # PyMuPDF | |
from docx import Document | |
import json | |
from pathlib import Path | |
from datetime import datetime | |
import re | |
import pytesseract | |
import io | |
def extract_text_from_pdf(pdf_file):
    """Extract text from uploaded PDF file.

    Args:
        pdf_file: File-like object exposing ``read()`` that yields the raw
            PDF bytes.

    Returns:
        The text of every page joined with newlines, or ``""`` if anything
        goes wrong (the error is reported through ``st.error``).
    """
    try:
        pdf_bytes = pdf_file.read()
        # Open as a context manager so the document handle is released even
        # if text extraction fails part-way through.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            # join() consumes the generator before the document is closed.
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort boundary: surface the problem in the UI and fall back
        # to an empty string rather than crashing the app.
        st.error(f"Error in PDF extraction: {str(e)}")
        return ""
def extract_text_from_docx(docx_file):
    """Return the paragraph text of an uploaded DOCX file.

    Paragraph texts are joined with newlines. On any failure the error is
    reported through ``st.error`` and an empty string is returned.
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as err:
        st.error(f"Error in DOCX extraction: {str(err)}")
        return ""
def parse_date(date_str):
    """Parse a month/year style date string into a ``datetime``.

    Recognised inputs (surrounding whitespace is ignored):

    * ``present`` / ``current`` / ``now`` (any case) -> ``datetime.now()``
    * ``YYYY``, ``Mon YYYY``, ``Month YYYY``, ``MM/YYYY``, ``MM-YYYY``,
      ``YYYY/MM``, ``YYYY-MM``
    * anything else containing a four-digit 19xx/20xx year -> 1 January of
      that year

    Args:
        date_str: The text to parse.

    Returns:
        A ``datetime`` on success, or ``None`` when the input is not a
        string or no date can be recognised.
    """
    if not isinstance(date_str, str):
        return None

    # Strip first so that " Present " is still recognised as open-ended.
    cleaned = date_str.strip()
    if not cleaned:
        return None

    # Handle 'Present' or 'Current'
    if cleaned.lower() in ('present', 'current', 'now'):
        return datetime.now()

    formats = (
        '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
        '%Y/%m', '%Y-%m',
    )
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    # Last resort: pick out a bare year (e.g. "Summer 1998", "Sept 2020").
    year_match = re.search(r'\b(?:19|20)\d{2}\b', cleaned)
    if year_match:
        return datetime.strptime(year_match.group(), '%Y')
    return None
# Separator surrounded by whitespace: "Jan 2020 - Present", "2018 to 2020",
# "Jan 2020 – Present" (en dash) or "Jan 2020 — Present" (em dash).
_SPACED_RANGE_SEP = re.compile(r'\s+(?:[-\u2013\u2014]|to)\s+', re.IGNORECASE)
# Bare dash with optional whitespace: "2018-2020", "2018- 2020".
_BARE_RANGE_SEP = re.compile(r'\s*[-\u2013\u2014]\s*')


def _split_duration(duration):
    """Split a duration string into its start/end parts (best effort)."""
    # Prefer whitespace-delimited separators so dates that themselves contain
    # a hyphen ("01-2020 - 05-2021") are not chopped into four pieces.
    parts = _SPACED_RANGE_SEP.split(duration.strip())
    if len(parts) != 2:
        parts = _BARE_RANGE_SEP.split(duration.strip())
    return parts


def calculate_experience(work_history):
    """Calculate total years of experience from work history.

    Each entry is expected to be a dict with a ``'duration'`` string such as
    ``"Jan 2020 - Present"`` or ``"2018-2020"``. Entries that are not dicts,
    have no usable duration, or whose dates cannot be parsed are skipped.
    Durations are simply summed, so overlapping jobs are counted twice.

    Args:
        work_history: Iterable of job dicts, or ``None``.

    Returns:
        Total experience in years, rounded to one decimal place.
    """
    total_experience = 0
    for job in work_history or []:
        if not isinstance(job, dict):
            continue
        duration = job.get('duration', '')
        if not duration or not isinstance(duration, str):
            continue
        parts = _split_duration(duration)
        if len(parts) != 2:
            continue
        start_date = parse_date(parts[0])
        end_date = parse_date(parts[1])
        if start_date and end_date:
            years = (end_date.year - start_date.year) + \
                (end_date.month - start_date.month) / 12
            # Ignore reversed ranges instead of subtracting experience.
            total_experience += max(0, years)
    return round(total_experience, 1)
def _extract_json_object(response_text):
    """Return the outermost ``{...}`` span of *response_text*, or ``None``."""
    json_start = response_text.find('{')
    json_end = response_text.rfind('}')
    if json_start == -1 or json_end < json_start:
        return None
    return response_text[json_start:json_end + 1]


def parse_resume(file_uploaded, api_key):
    """Parse resume and extract information.

    The uploaded file is converted to text (PDF, DOCX or OCR for images),
    sent to the Gemini model together with an extraction prompt, and the
    JSON object in the reply is decoded.

    Args:
        file_uploaded: Uploaded file object with a ``name`` attribute; it is
            handed to the matching text extractor.
        api_key: Gemini API key.

    Returns:
        The decoded dict with an added ``'total_years_experience'`` key, or
        ``None`` if the format is unsupported, no text could be extracted,
        or the model reply could not be parsed. Failures are reported
        through ``st.error``.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-pro')
    prompt = """Extract the following information from this resume:
1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
2. Full Name
3. Email Address
4. Phone Number
5. Education History (including degree, institution, graduation year, and field of study)
6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
7. Skills
8. LinkedIn Profile URL
Return the information in this JSON format:
{
"summary": "",
"name": "",
"email": "",
"phone": "",
"education": [
{
"degree": "",
"institution": "",
"year": "",
"field": "",
"gpa": ""
}
],
"work_experience": [
{
"company": "",
"position": "",
"duration": ""
}
],
"skills": [],
"linkedin": ""
}
For skills include tools and technologies in output if present any in resume.
For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
Only return the JSON object, nothing else. If any field is not found, leave it empty."""
    try:
        file_extension = Path(file_uploaded.name).suffix.lower()
        if file_extension == '.pdf':
            text_content = extract_text_from_pdf(file_uploaded)
        elif file_extension in ['.docx', '.doc']:
            text_content = extract_text_from_docx(file_uploaded)
        elif file_extension in ['.jpg', '.jpeg', '.png']:
            image = Image.open(file_uploaded)
            text_content = pytesseract.image_to_string(image)
        else:
            st.error(f"Unsupported file format: {file_extension}")
            return None

        # Do not spend an API call (and invite a made-up answer) when the
        # extractor produced nothing to analyse.
        if not text_content or not text_content.strip():
            st.error("No text could be extracted from the uploaded file.")
            return None

        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")

        # The model may wrap the JSON in prose or code fences; keep only the
        # outermost object.
        json_str = _extract_json_object(response.text)
        if json_str is None:
            st.error("Error parsing response: no JSON object found in model output")
            return None
        try:
            result = json.loads(json_str)
        except json.JSONDecodeError as e:
            st.error(f"Error parsing response: {str(e)}")
            return None

        # `or []` covers a JSON null as well as a missing key.
        total_exp = calculate_experience(result.get('work_experience') or [])
        result['total_years_experience'] = total_exp
        return result
    except Exception as e:
        # Top-level boundary for OCR, network and model errors.
        st.error(f"Error processing resume: {str(e)}")
        return None
def format_education(edu):
    """Format education details for display.

    Builds a single line such as
    ``"BSc in CS from MIT (2020) - GPA: 3.8"`` from whichever of the keys
    ``degree``, ``field``, ``institution``, ``year`` and ``gpa`` are present
    and non-empty. Values are converted to text first, so numeric years or
    GPAs are accepted.

    Args:
        edu: Dict describing one education entry. A non-dict value is
            rendered as its stripped string form.

    Returns:
        The formatted line, or ``""`` when nothing usable is present.
    """
    if not isinstance(edu, dict):
        return str(edu).strip() if edu else ""

    templates = (
        ('degree', "{}"),
        ('field', "in {}"),
        ('institution', "from {}"),
        ('year', "({})"),
        ('gpa', "- GPA: {}"),
    )
    parts = []
    for key, template in templates:
        raw = edu.get(key)
        # str() guards against numeric values; strip() drops blank strings.
        text = str(raw).strip() if raw else ""
        if text:
            parts.append(template.format(text))
    return " ".join(parts)
def main():
    """Render the Streamlit UI: upload a resume, parse it, show the result."""
    st.title("Resume Parser")
    st.write("Upload a resume (PDF, DOCX, or Image) to extract information")

    # Get API key from secrets or user input. Accessing st.secrets can raise
    # when no secrets file exists, so fall back to the text box in that case.
    try:
        api_key = st.secrets["GEMINI_API_KEY"]
    except (FileNotFoundError, KeyError):
        api_key = st.text_input("Enter Gemini API Key", type="password")

    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=["pdf", "docx", "doc", "jpg", "jpeg", "png"],
    )
    if not (uploaded_file and api_key):
        return

    with st.spinner('Analyzing resume...'):
        result = parse_resume(uploaded_file, api_key)
    if not result:
        return

    st.subheader("Extracted Information")

    # The model is told to return empty strings for missing fields, so use
    # `or` (not a .get default) to show the placeholder for blank values too.
    st.text_area("Summary", result.get('summary') or 'Not found', height=100)

    # Display personal information
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("**Name:**", result.get('name') or 'Not found')
    with col2:
        st.write("**Email:**", result.get('email') or 'Not found')
    with col3:
        st.write("**Phone:**", result.get('phone') or 'Not found')

    # Display total experience: years once at least one year, else months.
    total_exp = result.get('total_years_experience') or 0
    exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    # Display education
    st.subheader("Education")
    if result.get('education'):
        for edu in result['education']:
            st.write(f"- {format_education(edu)}")
    else:
        st.write("No education information found")

    # Display work experience
    st.subheader("Work Experience")
    if result.get('work_experience'):
        for exp in result['work_experience']:
            if not isinstance(exp, dict):
                # Skip malformed entries instead of crashing the page.
                continue
            position = exp.get('position') or 'Role not found'
            company = exp.get('company') or 'Company not found'
            duration = f" ({exp['duration']})" if exp.get('duration') else ""
            st.write(f"- {position} at {company}{duration}")
    else:
        st.write("No work experience found")

    # Display Skills
    st.subheader("Skills:")
    if result.get('skills'):
        for skill in result['skills']:
            st.write(f"- {skill}")
    else:
        st.write("- No skills found")

    # Display LinkedIn profile
    st.write("**LinkedIn Profile:**", result.get('linkedin') or 'Not found')


if __name__ == "__main__":
    main()