Update app.py
Browse files
app.py
CHANGED
@@ -16,11 +16,10 @@ import docx2txt
|
|
16 |
from io import StringIO
|
17 |
from PyPDF2 import PdfFileReader
|
18 |
import warnings
|
19 |
-
import nltk
|
20 |
|
21 |
-
nltk.
|
22 |
|
23 |
-
from nltk import sent_tokenize
|
24 |
warnings.filterwarnings("ignore")
|
25 |
|
26 |
|
@@ -71,7 +70,7 @@ def article_text_extractor(url: str):
|
|
71 |
|
72 |
def chunk_clean_text(text):
|
73 |
|
74 |
-
sentences =
|
75 |
current_chunk = 0
|
76 |
chunks = []
|
77 |
|
|
|
16 |
from io import StringIO
|
17 |
from PyPDF2 import PdfFileReader
|
18 |
import warnings
|
19 |
+
import nltk.data
|
20 |
|
21 |
+
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
|
22 |
|
|
|
23 |
warnings.filterwarnings("ignore")
|
24 |
|
25 |
|
|
|
70 |
|
71 |
def chunk_clean_text(text):
|
72 |
|
73 |
+
sentences = tokenizer.tokenize(text)
|
74 |
current_chunk = 0
|
75 |
chunks = []
|
76 |
|