Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -36,32 +36,37 @@ def extract_text_from_docx(file):
|
|
36 |
return "\n".join([para.text for para in doc.paragraphs])
|
37 |
|
38 |
def extract_companies(text):
|
|
|
39 |
doc = nlp(text)
|
40 |
companies = []
|
41 |
|
|
|
42 |
company_pattern = re.compile(
|
43 |
-
|
44 |
|
|
|
45 |
for ent in doc.ents:
|
46 |
-
if ent.label_ == "ORG"
|
47 |
-
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
return "\n".join(companies)
|
51 |
|
52 |
def extract_colleges(text):
|
53 |
doc = nlp(text)
|
54 |
colleges = []
|
55 |
|
|
|
56 |
edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
|
57 |
|
58 |
for sent in doc.sents:
|
|
|
59 |
edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
|
60 |
for edu in edu_ents:
|
61 |
colleges.append(edu.text)
|
62 |
-
|
63 |
-
|
64 |
-
return "\n".join(colleges)
|
65 |
|
66 |
def extract_years_of_experience(text):
|
67 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
@@ -122,27 +127,28 @@ def parse_resume(file):
|
|
122 |
summary = extract_summary(doc)
|
123 |
linkedin = extract_linkedin(text)
|
124 |
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
except Exception as e:
|
128 |
import traceback
|
129 |
-
return f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
|
130 |
|
131 |
-
# Create Gradio interface
|
132 |
iface = gr.Interface(
|
133 |
fn=parse_resume,
|
134 |
inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
|
135 |
-
outputs=
|
136 |
-
gr.Textbox(label="Companies Worked For", lines=10),
|
137 |
-
gr.Textbox(label="Colleges Attended", lines=10),
|
138 |
-
gr.Textbox(label="Years of Experience"),
|
139 |
-
gr.Textbox(label="Phone Number"),
|
140 |
-
gr.Textbox(label="Email ID"),
|
141 |
-
gr.Textbox(label="Summary", lines=3),
|
142 |
-
gr.Textbox(label="LinkedIn ID")
|
143 |
-
],
|
144 |
title="Advanced Resume Parser",
|
145 |
description="Upload a resume in PDF or DOCX format to extract key information."
|
146 |
)
|
147 |
|
148 |
-
iface.launch(share=True)
|
|
|
36 |
return "\n".join([para.text for para in doc.paragraphs])
|
37 |
|
38 |
def extract_companies(text):
    """Return the company-like organisation names found in *text*.

    The text is run through the module-level ``nlp`` pipeline. Every entity
    labelled ``ORG`` whose text contains one of the common company suffixes
    (matched case-insensitively as a whole word) is kept.

    Args:
        text: Raw resume text to analyse.

    Returns:
        A list of entity texts in the order the entities occur; duplicates
        are not removed.
    """
    parsed = nlp(text)

    # Whole-word, case-insensitive match on typical company name suffixes.
    suffix_re = re.compile(
        r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)

    return [
        entity.text
        for entity in parsed.ents
        if entity.label_ == "ORG" and suffix_re.search(entity.text)
    ]
|
|
|
55 |
|
56 |
def extract_colleges(text):
    """Return the education-related organisation names found in *text*.

    The text is run through the module-level ``nlp`` pipeline and walked
    sentence by sentence. An entity is kept when it is labelled ``ORG`` and
    its lower-cased text contains at least one education keyword as a
    substring.

    Args:
        text: Raw resume text to analyse.

    Returns:
        A list of entity texts in the order the entities occur; duplicates
        are not removed.
    """
    parsed = nlp(text)

    # Substrings that mark an organisation as an educational institution.
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]

    def mentions_education(name):
        """Return True when *name* contains any education keyword."""
        lowered = name.lower()
        for keyword in edu_keywords:
            if keyword in lowered:
                return True
        return False

    found = []
    for sentence in parsed.sents:
        found.extend(
            entity.text
            for entity in sentence.ents
            if entity.label_ == "ORG" and mentions_education(entity.text)
        )
    return found
|
|
|
70 |
|
71 |
def extract_years_of_experience(text):
|
72 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
|
|
127 |
summary = extract_summary(doc)
|
128 |
linkedin = extract_linkedin(text)
|
129 |
|
130 |
+
result = {
|
131 |
+
"Companies Worked For": companies,
|
132 |
+
"Colleges Attended": colleges,
|
133 |
+
"Years of Experience": years_of_experience,
|
134 |
+
"Phone Number": phone,
|
135 |
+
"Email ID": email,
|
136 |
+
"Summary": summary,
|
137 |
+
"LinkedIn ID": linkedin
|
138 |
+
}
|
139 |
+
|
140 |
+
return result
|
141 |
except Exception as e:
|
142 |
import traceback
|
143 |
+
return {"Error": f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"}
|
144 |
|
145 |
+
# Build the Gradio UI: one file-upload input wired to parse_resume, with the
# returned value rendered in a single JSON output component.
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=gr.JSON(label="Extracted Information"),
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information."
)

# Start the app; share=True asks Gradio for a public share link as well.
iface.launch(share=True)
|