Spaces:

Manojajj
/

dbmdz-bert-large-cased-finetuned

Sleeping

App Files Files Community

dbmdz-bert-large-cased-finetuned / app.py

Manojajj

Update app.py

2543fdd verified 2 months ago

raw

history blame contribute delete

2.4 kB

	import gradio as gr
	import torch
	from transformers import pipeline
	import pdfplumber
	import re
	import pandas as pd

	# Module-wide Named Entity Recognition (NER) pipeline, built once at import
	# time and reused for every resume that gets parsed.
	nlp = pipeline(
	    "ner",
	    model="dbmdz/bert-large-cased-finetuned-conll03-english",
	    framework="pt",
	)

	def extract_text_from_pdf(pdf_file):
	    """Extract the text of every page of an uploaded PDF resume.

	    Args:
	        pdf_file: The PDF to read; passed straight to ``pdfplumber.open``.

	    Returns:
	        The text of all pages concatenated in page order with no separator.
	        Pages that yield no text contribute nothing, so a PDF without any
	        extractable text produces an empty string.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # ``extract_text()`` may give ``None`` instead of a string for a page
	        # with no text layer (e.g. a scanned image); ``or ""`` keeps such a
	        # page from raising ``TypeError`` during concatenation.
	        return "".join(page.extract_text() or "" for page in pdf.pages)

	def parse_resume(resume_text):
	    """Parse resume text and extract the phone number, email and skills.

	    Args:
	        resume_text: Plain text of a resume.

	    Returns:
	        A dict with three string values:
	          * ``"Phone"``  - first phone-like match, or ``"Not found"``.
	          * ``"Email"``  - first email-like match, or ``"Not found"``.
	          * ``"Skills"`` - the ``word`` of every NER entity whose label
	            contains ``MISC``, joined with ``", "`` (empty string if none).
	    """
	    # Three digits (optionally parenthesised), three digits, four digits,
	    # each group optionally separated by "-", "." or whitespace.
	    phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
	    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

	    # Only the first match of each pattern ends up in the result.
	    phone = re.findall(phone_pattern, resume_text)
	    email = re.findall(email_pattern, resume_text)

	    # Blank text (e.g. a PDF with no extractable text) cannot contain any
	    # entities, so skip the model call entirely in that case.
	    entities = nlp(resume_text) if resume_text.strip() else []
	    skills = [entity['word'] for entity in entities if 'MISC' in entity['entity']]

	    # Experience, Education and Certifications are intentionally not reported.
	    return {
	        "Phone": phone[0] if phone else "Not found",
	        "Email": email[0] if email else "Not found",
	        "Skills": ", ".join(skills),
	    }

	def process_resumes(pdf_files):
	    """Parse each uploaded resume and write all results to one Excel file.

	    Args:
	        pdf_files: Iterable of uploaded PDFs; each item is handed to
	            ``extract_text_from_pdf``.

	    Returns:
	        The path of the written workbook, ``"parsed_resumes.xlsx"``, which
	        holds one row per resume in upload order.
	    """
	    # One parsed-fields dict per resume, in the order the files were given.
	    rows = [
	        parse_resume(extract_text_from_pdf(uploaded))
	        for uploaded in pdf_files
	    ]

	    # Tabulate the rows and persist them without the DataFrame index column.
	    workbook_path = "parsed_resumes.xlsx"
	    pd.DataFrame(rows).to_excel(workbook_path, index=False)
	    return workbook_path

	# Build the Gradio UI: several PDFs in, one downloadable Excel file out.
	# The description lists only the fields that process_resumes actually
	# writes (Phone, Email, Skills); no name column is produced.
	gr.Interface(
	    fn=process_resumes,
	    inputs=gr.File(file_count="multiple", label="Upload Resumes (PDFs)"),
	    outputs=gr.File(label="Download Parsed Data (Excel)"),
	    title="AI Resume Parser",
	    description="Upload multiple resumes (PDFs) to extract details like Email, Phone, and Skills. The results will be saved in an Excel file.",
	).launch()