Update app.py
app.py
CHANGED
@@ -2,16 +2,16 @@ import streamlit as st
 import google.generativeai as genai
 import fitz
 import spacy
-from transformers import pipeline
 from docx import Document
 import dateparser
 from datetime import datetime
+from giner import GiNER

 # Load SpaCy model
 nlp = spacy.load('en_core_web_sm')

-# Load
-
+# Load GLiNER model
+giner = GiNER("roberta-large")

 def extract_text_from_pdf(file):
     pdf = fitz.open(stream=file.read(), filetype="pdf")
@@ -40,22 +40,30 @@ def generate_summary(text, model):

 def extract_info(text):
     doc = nlp(text)
-
+    giner_results = giner.annotate(text)

     # Extract companies
     companies = set([ent.text for ent in doc.ents if ent.label_ == "ORG"])
-    companies.update([
+    companies.update([entity['text'] for entity in giner_results if entity['type'] == "ORG"])

     # Extract experience
     experience = max([datetime.now().year - date.year for ent in doc.ents if ent.label_ == "DATE" and (date := dateparser.parse(ent.text)) and date.year <= datetime.now().year] or [0])

     # Extract education
     education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
+    education.update([entity['text'] for entity in giner_results if entity['type'] == "ORG" and any(keyword in entity['text'].lower() for keyword in ["university", "college", "institute", "school"])])

     # Extract contact information
     email = next((ent.text for ent in doc.ents if ent.label_ == "EMAIL"), "Not found")
     phone = next((ent.text for ent in doc.ents if ent.label_ == "PHONE_NUMBER"), "Not found")

+    # Use GLiNER for additional entity extraction
+    for entity in giner_results:
+        if entity['type'] == "PER" and email == "Not found":
+            email = entity['text']
+        elif entity['type'] == "PHONE" and phone == "Not found":
+            phone = entity['text']
+
     return list(companies), experience, list(education), email, phone

 def main():
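The new dependency is worth a closer look: the published gliner package is imported as GLiNER, loaded with GLiNER.from_pretrained, and queried with predict_entities rather than an annotate method. If that package is the intended backend for the GiNER calls in this diff, the organization lookup could be sketched as below; the checkpoint name, label strings, and threshold are illustrative assumptions, not taken from this commit.

# Sketch assuming the gliner package (pip install gliner); checkpoint, labels and
# threshold are illustrative assumptions.
from gliner import GLiNER

gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

def gliner_orgs(text):
    # predict_entities returns dicts with "text", "label", "start", "end" and "score" keys
    entities = gliner_model.predict_entities(text, ["organization"], threshold=0.5)
    return {e["text"] for e in entities if e["label"] == "organization"}

With such a helper, the companies update in extract_info would read companies.update(gliner_orgs(text)).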
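The experience line packs the whole heuristic into one comprehension: every spaCy DATE entity is run through dateparser, future years are discarded, and the largest gap to the current year is kept, defaulting to 0 when nothing parses. A standalone illustration with hand-picked date strings (the sample texts are made up):

# Illustration of the experience heuristic with stand-in DATE entity texts.
import dateparser
from datetime import datetime

date_texts = ["June 2016", "2020", "2031"]  # "2031" is dropped as a future year
current_year = datetime.now().year
past_years = [d.year for t in date_texts if (d := dateparser.parse(t)) and d.year <= current_year]
experience = max([current_year - y for y in past_years] or [0])
print(experience)  # largest gap to the current year, driven here by "June 2016"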
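On the contact fields: the stock en_core_web_sm pipeline emits the OntoNotes labels (PERSON, ORG, GPE, DATE and so on), not EMAIL or PHONE_NUMBER, so the two next(...) lookups will normally fall through to "Not found" and only the GLiNER loop can fill them. A plain regex fallback is a common alternative; the function and patterns below are illustrative, not part of this commit.

# Hypothetical regex fallback for contact details; patterns are illustrative.
import re

def extract_contacts(text):
    email_match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    phone_match = re.search(r"\+?\d[\d\s().-]{7,}\d", text)
    email = email_match.group(0) if email_match else "Not found"
    phone = phone_match.group(0) if phone_match else "Not found"
    return email, phone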