sotirios-slv committed on
Commit
2e73fb1
·
1 Parent(s): 8b6c55a

Added ner function

Browse files
Files changed (2) hide show
  1. app.py +14 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -2,6 +2,11 @@ from PIL import Image
2
  import pytesseract
3
  import gradio as gr
4
  import os
 
 
 
 
 
5
 
6
  langs = []
7
 
@@ -10,6 +15,15 @@ choices = os.popen("tesseract --list-langs").read().split("\n")[1:-1]
10
  blocks = gr.Blocks()
11
 
12
 
 
 
 
 
 
 
 
 
 
13
  # If you don't have tesseract executable in your PATH, include the following:
14
  # pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
15
  # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
 
2
  import pytesseract
3
  import gradio as gr
4
  import os
5
+ from flair.data import Sentence
6
+ from flair.models import SequenceTagger
7
+ from segtok.segmenter import split_single
8
+
9
+ tagger = SequenceTagger.load("ner-ontonotes")
10
 
11
  langs = []
12
 
 
15
  blocks = gr.Blocks()
16
 
17
 
18
def get_named_entities(ocr_text: str) -> list:
    """Return the named-entity spans the module-level ``tagger`` finds in *ocr_text*.

    The text is split into sentences with ``split_single``, each sentence is
    wrapped in a flair ``Sentence``, the batch is tagged in place by
    ``tagger.predict``, and the ``"ner"`` spans of every sentence are
    collected into one flat list, in sentence order.

    Args:
        ocr_text: Raw text (e.g. OCR output) to analyse.

    Returns:
        A flat list of the spans returned by ``Sentence.get_spans("ner")``
        for each sentence; an empty list when the text has no non-blank
        sentence.
    """
    # Skip blank segments so no empty Sentence object is built.
    # NOTE(review): confirm how the installed flair version treats empty text.
    sentences = [
        Sentence(text, use_tokenizer=True)
        for text in split_single(ocr_text)
        if text.strip()
    ]
    if not sentences:
        return []

    # predict() annotates the Sentence objects in place.
    tagger.predict(sentences)

    # The outer loop (sentences) must come first in the comprehension: the
    # original had the two ``for`` clauses reversed, which referenced the
    # sentence variable before it was bound and raised NameError.
    return [span for sentence in sentences for span in sentence.get_spans("ner")]
25
+
26
+
27
  # If you don't have tesseract executable in your PATH, include the following:
28
  # pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
29
  # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 
1
  gradio
2
  pytesseract
 
1
+ flair
2
  gradio
3
  pytesseract