bangaboy committed on
Commit
a45aab6
·
verified ·
1 Parent(s): 5840b65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -5
app.py CHANGED
@@ -2,16 +2,16 @@ import streamlit as st
2
  import google.generativeai as genai
3
  import fitz
4
  import spacy
5
- from transformers import pipeline
6
  from docx import Document
7
  import dateparser
8
  from datetime import datetime
 
9
 
10
  # Load SpaCy model
11
  nlp = spacy.load('en_core_web_sm')
12
 
13
- # Load NER pipeline
14
- ner = pipeline('ner', model="Babelscape/wikineural-multilingual-ner", aggregation_strategy="simple")
15
 
16
  def extract_text_from_pdf(file):
17
  pdf = fitz.open(stream=file.read(), filetype="pdf")
@@ -40,22 +40,30 @@ def generate_summary(text, model):
40
 
41
  def extract_info(text):
42
  doc = nlp(text)
43
- ner_results = ner(text)
44
 
45
  # Extract companies
46
  companies = set([ent.text for ent in doc.ents if ent.label_ == "ORG"])
47
- companies.update([result['word'] for result in ner_results if result['entity_group'] == "ORG"])
48
 
49
  # Extract experience
50
  experience = max([datetime.now().year - date.year for ent in doc.ents if ent.label_ == "DATE" and (date := dateparser.parse(ent.text)) and date.year <= datetime.now().year] or [0])
51
 
52
  # Extract education
53
  education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
 
54
 
55
  # Extract contact information
56
  email = next((ent.text for ent in doc.ents if ent.label_ == "EMAIL"), "Not found")
57
  phone = next((ent.text for ent in doc.ents if ent.label_ == "PHONE_NUMBER"), "Not found")
58
 
 
 
 
 
 
 
 
59
  return list(companies), experience, list(education), email, phone
60
 
61
  def main():
 
2
  import google.generativeai as genai
3
  import fitz
4
  import spacy
 
5
  from docx import Document
6
  import dateparser
7
  from datetime import datetime
8
+ from giner import GiNER
9
 
10
  # Load SpaCy model
11
  nlp = spacy.load('en_core_web_sm')
12
 
13
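+ # Load the NER model (NOTE(review): this comment originally said "GLiNER", but the import is `giner.GiNER` -- confirm the intended package and loader)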
14
+ giner = GiNER("roberta-large")
15
 
16
  def extract_text_from_pdf(file):
17
  pdf = fitz.open(stream=file.read(), filetype="pdf")
 
40
 
41
  def extract_info(text):
42
  doc = nlp(text)
43
+ giner_results = giner.annotate(text)
44
 
45
  # Extract companies
46
  companies = set([ent.text for ent in doc.ents if ent.label_ == "ORG"])
47
+ companies.update([entity['text'] for entity in giner_results if entity['type'] == "ORG"])
48
 
49
  # Extract experience
50
  experience = max([datetime.now().year - date.year for ent in doc.ents if ent.label_ == "DATE" and (date := dateparser.parse(ent.text)) and date.year <= datetime.now().year] or [0])
51
 
52
  # Extract education
53
  education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
54
+ education.update([entity['text'] for entity in giner_results if entity['type'] == "ORG" and any(keyword in entity['text'].lower() for keyword in ["university", "college", "institute", "school"])])
55
 
56
  # Extract contact information
57
  email = next((ent.text for ent in doc.ents if ent.label_ == "EMAIL"), "Not found")
58
  phone = next((ent.text for ent in doc.ents if ent.label_ == "PHONE_NUMBER"), "Not found")
59
 
60
+ # Use GLiNER for additional entity extraction
61
+ for entity in giner_results:
62
+ if entity['type'] == "PER" and email == "Not found":
63
+ email = entity['text']
64
+ elif entity['type'] == "PHONE" and phone == "Not found":
65
+ phone = entity['text']
66
+
67
  return list(companies), experience, list(education), email, phone
68
 
69
  def main():