Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
import
|
3 |
import docx
|
4 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
5 |
|
@@ -21,11 +21,7 @@ model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", trust_rem
|
|
21 |
def process_document(document_file):
|
22 |
document_text = ""
|
23 |
if document_file.type == "application/pdf":
|
24 |
-
|
25 |
-
for page in pdf.pages:
|
26 |
-
text = page.extract_text()
|
27 |
-
if text:
|
28 |
-
document_text += text.strip() + "\n\n"
|
29 |
elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
30 |
docx_file = docx.Document(document_file)
|
31 |
for paragraph in docx_file.paragraphs:
|
|
|
1 |
import streamlit as st
|
2 |
+
from pdfminer.high_level import extract_text
|
3 |
import docx
|
4 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
5 |
|
|
|
21 |
def process_document(document_file):
|
22 |
document_text = ""
|
23 |
if document_file.type == "application/pdf":
|
24 |
+
document_text = extract_text(document_file)
|
|
|
|
|
|
|
|
|
25 |
elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
26 |
docx_file = docx.Document(document_file)
|
27 |
for paragraph in docx_file.paragraphs:
|