Prernas19 committed on
Commit
6116745
·
verified ·
1 Parent(s): 1053c28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -22
app.py CHANGED
@@ -36,32 +36,37 @@ def extract_text_from_docx(file):
36
  return "\n".join([para.text for para in doc.paragraphs])
37
 
38
  def extract_companies(text):
 
39
  doc = nlp(text)
40
  companies = []
41
 
 
42
  company_pattern = re.compile(
43
- r'\b(?:Inc\.|Corp\.|LLC|Ltd\.|Co\.|Company|Group|Services|Technologies|Pvt\.|Solutions|Consulting|Associates|Enterprises|Partners|Holdings|Systems|Networks|Ventures|Partners|International|Ltd|GmbH|S\.A\.|S\.L\.|LLP|PLC|AG|LLC)\b', re.IGNORECASE)
44
 
 
45
  for ent in doc.ents:
46
- if ent.label_ == "ORG" and company_pattern.search(ent.text):
47
- companies.append(ent.text)
 
 
48
 
49
- # Join companies with new lines
50
- return "\n".join(companies)
51
 
52
  def extract_colleges(text):
53
  doc = nlp(text)
54
  colleges = []
55
 
 
56
  edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
57
 
58
  for sent in doc.sents:
 
59
  edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
60
  for edu in edu_ents:
61
  colleges.append(edu.text)
62
-
63
- # Join colleges with new lines
64
- return "\n".join(colleges)
65
 
66
  def extract_years_of_experience(text):
67
  years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
@@ -122,27 +127,28 @@ def parse_resume(file):
122
  summary = extract_summary(doc)
123
  linkedin = extract_linkedin(text)
124
 
125
- return companies, colleges, years_of_experience, phone, email, summary, linkedin
126
-
 
 
 
 
 
 
 
 
 
127
  except Exception as e:
128
  import traceback
129
- return f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
130
 
131
- # Create Gradio interface with separate output components
132
  iface = gr.Interface(
133
  fn=parse_resume,
134
  inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
135
- outputs=[
136
- gr.Textbox(label="Companies Worked For", lines=10),
137
- gr.Textbox(label="Colleges Attended", lines=10),
138
- gr.Textbox(label="Years of Experience"),
139
- gr.Textbox(label="Phone Number"),
140
- gr.Textbox(label="Email ID"),
141
- gr.Textbox(label="Summary", lines=3),
142
- gr.Textbox(label="LinkedIn ID")
143
- ],
144
  title="Advanced Resume Parser",
145
  description="Upload a resume in PDF or DOCX format to extract key information."
146
  )
147
 
148
- iface.launch(share=True)
 
36
  return "\n".join([para.text for para in doc.paragraphs])
37
 
38
  def extract_companies(text):
39
+ # Process the text with the spaCy model
40
  doc = nlp(text)
41
  companies = []
42
 
43
+ # Define a regex pattern for common company name suffixes
44
  company_pattern = re.compile(
45
+ r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
46
 
47
+ # Iterate over the identified entities in the text
48
  for ent in doc.ents:
49
+ if ent.label_ == "ORG":
50
+ # Apply the regex pattern to filter out company names
51
+ if company_pattern.search(ent.text):
52
+ companies.append(ent.text)
53
 
54
+ return companies
 
55
 
56
  def extract_colleges(text):
57
  doc = nlp(text)
58
  colleges = []
59
 
60
+ # Extended list of education-related keywords
61
  edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
62
 
63
  for sent in doc.sents:
64
+ # Extract entities labeled as ORG and check if they contain education-related keywords
65
  edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
66
  for edu in edu_ents:
67
  colleges.append(edu.text)
68
+
69
+ return colleges
 
70
 
71
  def extract_years_of_experience(text):
72
  years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
 
127
  summary = extract_summary(doc)
128
  linkedin = extract_linkedin(text)
129
 
130
+ result = {
131
+ "Companies Worked For": companies,
132
+ "Colleges Attended": colleges,
133
+ "Years of Experience": years_of_experience,
134
+ "Phone Number": phone,
135
+ "Email ID": email,
136
+ "Summary": summary,
137
+ "LinkedIn ID": linkedin
138
+ }
139
+
140
+ return result
141
  except Exception as e:
142
  import traceback
143
+ return {"Error": f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"}
144
 
145
+ # Create Gradio interface
146
  iface = gr.Interface(
147
  fn=parse_resume,
148
  inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
149
+ outputs=gr.JSON(label="Extracted Information"),
 
 
 
 
 
 
 
 
150
  title="Advanced Resume Parser",
151
  description="Upload a resume in PDF or DOCX format to extract key information."
152
  )
153
 
154
+ iface.launch(share=True)