Update app.py
app.py
CHANGED
@@ -2,16 +2,16 @@ import streamlit as st
 import google.generativeai as genai
 import fitz
 import spacy
-from transformers import pipeline
 from docx import Document
 import dateparser
 from datetime import datetime
+from giner import GiNER

 # Load SpaCy model
 nlp = spacy.load('en_core_web_sm')

-# Load
-
+# Load GLiNER model
+giner = GiNER("roberta-large")

 def extract_text_from_pdf(file):
     pdf = fitz.open(stream=file.read(), filetype="pdf")
@@ -40,22 +40,30 @@ def generate_summary(text, model):

 def extract_info(text):
     doc = nlp(text)
-
+    giner_results = giner.annotate(text)

     # Extract companies
     companies = set([ent.text for ent in doc.ents if ent.label_ == "ORG"])
-    companies.update([
+    companies.update([entity['text'] for entity in giner_results if entity['type'] == "ORG"])

     # Extract experience
     experience = max([datetime.now().year - date.year for ent in doc.ents if ent.label_ == "DATE" and (date := dateparser.parse(ent.text)) and date.year <= datetime.now().year] or [0])

     # Extract education
     education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
+    education.update([entity['text'] for entity in giner_results if entity['type'] == "ORG" and any(keyword in entity['text'].lower() for keyword in ["university", "college", "institute", "school"])])

     # Extract contact information
     email = next((ent.text for ent in doc.ents if ent.label_ == "EMAIL"), "Not found")
     phone = next((ent.text for ent in doc.ents if ent.label_ == "PHONE_NUMBER"), "Not found")

+    # Use GLiNER for additional entity extraction
+    for entity in giner_results:
+        if entity['type'] == "PER" and email == "Not found":
+            email = entity['text']
+        elif entity['type'] == "PHONE" and phone == "Not found":
+            phone = entity['text']
+
     return list(companies), experience, list(education), email, phone

 def main():
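The new dependency is worth a closer look: the published gliner package is imported as GLiNER, loaded with GLiNER.from_pretrained, and queried with predict_entities rather than an annotate method. If that package is the intended backend for the GiNER calls in this diff, the organization lookup could be sketched as below; the checkpoint name, label strings, and threshold are illustrative assumptions, not taken from this commit.

# Sketch assuming the gliner package (pip install gliner); checkpoint, labels and
# threshold are illustrative assumptions.
from gliner import GLiNER

gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

def gliner_orgs(text):
    # predict_entities returns dicts with "text", "label", "start", "end" and "score" keys
    entities = gliner_model.predict_entities(text, ["organization"], threshold=0.5)
    return {e["text"] for e in entities if e["label"] == "organization"}

With such a helper, the companies update in extract_info would read companies.update(gliner_orgs(text)).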
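The experience line packs the whole heuristic into one comprehension: every spaCy DATE entity is run through dateparser, future years are discarded, and the largest gap to the current year is kept, defaulting to 0 when nothing parses. A standalone illustration with hand-picked date strings (the sample texts are made up):

# Illustration of the experience heuristic with stand-in DATE entity texts.
import dateparser
from datetime import datetime

date_texts = ["June 2016", "2020", "2031"]  # "2031" is dropped as a future year
current_year = datetime.now().year
past_years = [d.year for t in date_texts if (d := dateparser.parse(t)) and d.year <= current_year]
experience = max([current_year - y for y in past_years] or [0])
print(experience)  # largest gap to the current year, driven here by "June 2016"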
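On the contact fields: the stock en_core_web_sm pipeline emits the OntoNotes labels (PERSON, ORG, GPE, DATE and so on), not EMAIL or PHONE_NUMBER, so the two next(...) lookups will normally fall through to "Not found" and only the GLiNER loop can fill them. A plain regex fallback is a common alternative; the function and patterns below are illustrative, not part of this commit.

# Hypothetical regex fallback for contact details; patterns are illustrative.
import re

def extract_contacts(text):
    email_match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    phone_match = re.search(r"\+?\d[\d\s().-]{7,}\d", text)
    email = email_match.group(0) if email_match else "Not found"
    phone = phone_match.group(0) if phone_match else "Not found"
    return email, phone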