Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,16 +3,15 @@ import os
|
|
3 |
import time
|
4 |
import pandas as pd
|
5 |
import sqlite3
|
6 |
-
import ocrmypdf
|
7 |
import logging
|
8 |
|
9 |
-
from langchain.document_loaders import OnlinePDFLoader # for loading the PDF
|
10 |
from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
|
11 |
from langchain.text_splitter import CharacterTextSplitter
|
12 |
from langchain_community.vectorstores import Chroma # updated import for vectorization
|
13 |
from langchain.chains import RetrievalQA # for QA chain
|
14 |
from langchain_community.chat_models import ChatOpenAI # updated import for ChatOpenAI
|
15 |
-
from langchain_core.prompts import PromptTemplate #
|
16 |
|
17 |
# Setup basic logging
|
18 |
logging.basicConfig(level=logging.INFO)
|
@@ -24,36 +23,16 @@ def update_log(message):
|
|
24 |
log_messages += message + "\n"
|
25 |
logger.info(message)
|
26 |
|
27 |
-
def ocr_converter(input_file):
    """Run OCR on an uploaded PDF in place and return its path.

    Args:
        input_file: Uploaded file object; its ``name`` attribute is used as
            both the input and the output path passed to ``ocrmypdf.ocr``.

    Returns:
        The path of the PDF that was processed (same file that was given).

    Raises:
        Exception: Any error raised by ``ocrmypdf.ocr`` is logged through
            ``update_log`` and then re-raised unchanged.
    """
    image_pdf = input_file.name
    try:
        # Disable deskew, clean_final, and remove_background to avoid
        # compatibility issues with --redo-ocr.
        # force_ocr is intentionally not passed: ocrmypdf accepts only one of
        # force_ocr / skip_text / redo_ocr and rejects the call when two are set.
        ocrmypdf.ocr(
            image_pdf,
            image_pdf,
            redo_ocr=True,
            language="eng",
            output_type="pdf",
            deskew=False,
            clean_final=False,
            remove_background=False,
        )
    except Exception as e:
        error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
        update_log(error_msg)
        # Bare raise keeps the original traceback intact.
        raise
    else:
        # Logged outside the try body so a logging failure is not reported
        # as an OCR failure.
        update_log(f"OCR conversion successful for {image_pdf}")
    return image_pdf
|
48 |
-
|
49 |
def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
|
50 |
try:
|
51 |
if open_ai_key is not None:
|
52 |
os.environ['OPENAI_API_KEY'] = open_ai_key
|
53 |
-
|
54 |
-
|
|
|
|
|
55 |
pages = loader.load_and_split()
|
56 |
-
update_log(f"
|
57 |
|
58 |
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
59 |
pages_to_be_loaded = []
|
@@ -138,13 +117,13 @@ def load_master_questionset_into_sqlite(connection):
|
|
138 |
update_log(f"Total questions in DB: {total_questions}")
|
139 |
|
140 |
def create_field_and_question_list_for_DOC_A():
    """Return the sample field names and their questions for DOC_A.

    Returns:
        A ``(fields, queries)`` tuple of two lists of equal length, where
        ``queries[i]`` is the question asked to fill ``fields[i]``.
    """
    # Two sample entries
    fields = ["Loan Number", "Borrower"]
    queries = ["What is the Loan Number?", "Who is the Borrower?"]
    return fields, queries
|
145 |
|
146 |
def create_field_and_question_list_for_DOC_B():
    """Return the sample field names and their questions for DOC_B.

    Returns:
        A ``(fields, queries)`` tuple of two lists of equal length, where
        ``queries[i]`` is the question asked to fill ``fields[i]``.
    """
    # Two sample entries
    fields = ["Property Address", "Signed Date"]
    queries = ["What is the Property Address?", "What is the Signed Date?"]
    return fields, queries
|
@@ -294,11 +273,26 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
|
|
294 |
log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
|
295 |
|
296 |
with gr.Tab("OCR Converter"):
|
|
|
297 |
with gr.Column():
|
298 |
-
image_pdf = gr.File(label="Load PDF for
|
299 |
with gr.Row():
|
300 |
-
|
301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
with gr.Tab("Upload Question Set"):
|
304 |
with gr.Column():
|
@@ -320,7 +314,6 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
|
|
320 |
load_fields_btn.click(retrieve_fields_and_questions, inputs=questionsets, outputs=fields_and_questions)
|
321 |
answer_predefined_btn.click(answer_predefined_questions, inputs=questionsets, outputs=answers_df)
|
322 |
|
323 |
-
convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
|
324 |
load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
|
325 |
|
326 |
demo.launch(debug=True)
|
|
|
3 |
import time
|
4 |
import pandas as pd
|
5 |
import sqlite3
|
|
|
6 |
import logging
|
7 |
|
8 |
+
from langchain.document_loaders import OnlinePDFLoader # for loading the PDF text
|
9 |
from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
|
10 |
from langchain.text_splitter import CharacterTextSplitter
|
11 |
from langchain_community.vectorstores import Chroma # updated import for vectorization
|
12 |
from langchain.chains import RetrievalQA # for QA chain
|
13 |
from langchain_community.chat_models import ChatOpenAI # updated import for ChatOpenAI
|
14 |
+
from langchain_core.prompts import PromptTemplate # prompt template import
|
15 |
|
16 |
# Setup basic logging
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
23 |
log_messages += message + "\n"
|
24 |
logger.info(message)
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
|
27 |
try:
|
28 |
if open_ai_key is not None:
|
29 |
os.environ['OPENAI_API_KEY'] = open_ai_key
|
30 |
+
|
31 |
+
# Use the file path directly as OCR is removed; text is extracted via the document loader.
|
32 |
+
pdf_path = pdf_doc.name
|
33 |
+
loader = OnlinePDFLoader(pdf_path)
|
34 |
pages = loader.load_and_split()
|
35 |
+
update_log(f"Extracted text from {len(pages)} pages in {pdf_path}")
|
36 |
|
37 |
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
38 |
pages_to_be_loaded = []
|
|
|
117 |
update_log(f"Total questions in DB: {total_questions}")
|
118 |
|
119 |
def create_field_and_question_list_for_DOC_A():
    """Return the sample field names and their questions for DOC_A.

    Returns:
        A ``(fields, queries)`` tuple of two lists of equal length, where
        ``queries[i]`` is the question asked to fill ``fields[i]``.
    """
    # Two sample entries for DOC_A
    fields = ["Loan Number", "Borrower"]
    queries = ["What is the Loan Number?", "Who is the Borrower?"]
    return fields, queries
|
124 |
|
125 |
def create_field_and_question_list_for_DOC_B():
    """Return the sample field names and their questions for DOC_B.

    Returns:
        A ``(fields, queries)`` tuple of two lists of equal length, where
        ``queries[i]`` is the question asked to fill ``fields[i]``.
    """
    # Two sample entries for DOC_B
    fields = ["Property Address", "Signed Date"]
    queries = ["What is the Property Address?", "What is the Signed Date?"]
    return fields, queries
|
|
|
273 |
log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
|
274 |
|
275 |
with gr.Tab("OCR Converter"):
|
276 |
+
# This tab is now repurposed (or can be removed)
|
277 |
with gr.Column():
|
278 |
+
image_pdf = gr.File(label="Load PDF for Conversion", file_types=['.pdf'], type='filepath')
|
279 |
with gr.Row():
|
280 |
+
extracted_text = gr.Textbox(label="Extracted Text", lines=10)
|
281 |
+
extract_btn = gr.Button("Extract Text")
|
282 |
+
|
283 |
+
# For demonstration, extract text using OnlinePDFLoader
|
284 |
+
def extract_text(pdf_file):
    """Extract the text of an uploaded PDF for display in the UI.

    Args:
        pdf_file: The uploaded PDF, either as a path string or as an object
            whose ``name`` attribute holds the path. ``None`` means nothing
            was uploaded.

    Returns:
        The ``page_content`` of every loaded document joined with newlines,
        or a string starting with ``"Error extracting text: "`` on failure.
        Failures are logged through ``update_log`` and never raised.
    """
    if pdf_file is None:
        # Give a readable message instead of leaking an AttributeError text.
        err = "Error extracting text: no PDF file was provided."
        update_log(err)
        return err
    try:
        # The component is declared with type='filepath', so a plain string
        # path is accepted as well as a file-like object exposing ``.name``.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        loader = OnlinePDFLoader(pdf_path)
        docs = loader.load_and_split()
        text = "\n".join(doc.page_content for doc in docs)
        # NOTE(review): the message says "pages" but counts the documents
        # returned by load_and_split() - confirm these map to pages.
        update_log(f"Extracted text from {len(docs)} pages.")
        return text
    except Exception as e:
        # Best-effort UI handler: report the failure in the output textbox.
        err = f"Error extracting text: {str(e)}"
        update_log(err)
        return err
|
295 |
+
extract_btn.click(extract_text, inputs=image_pdf, outputs=extracted_text)
|
296 |
|
297 |
with gr.Tab("Upload Question Set"):
|
298 |
with gr.Column():
|
|
|
314 |
load_fields_btn.click(retrieve_fields_and_questions, inputs=questionsets, outputs=fields_and_questions)
|
315 |
answer_predefined_btn.click(answer_predefined_questions, inputs=questionsets, outputs=answers_df)
|
316 |
|
|
|
317 |
load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
|
318 |
|
319 |
demo.launch(debug=True)
|