Omarrran committed on
Commit
8b91948
·
verified ·
1 Parent(s): 24f4bce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -34
app.py CHANGED
@@ -3,16 +3,15 @@ import os
3
  import time
4
  import pandas as pd
5
  import sqlite3
6
- import ocrmypdf
7
  import logging
8
 
9
- from langchain.document_loaders import OnlinePDFLoader # for loading the PDF
10
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain_community.vectorstores import Chroma # updated import for vectorization
13
  from langchain.chains import RetrievalQA # for QA chain
14
  from langchain_community.chat_models import ChatOpenAI # updated import for ChatOpenAI
15
- from langchain_core.prompts import PromptTemplate # updated import per deprecation notice
16
 
17
  # Setup basic logging
18
  logging.basicConfig(level=logging.INFO)
@@ -24,36 +23,16 @@ def update_log(message):
24
  log_messages += message + "\n"
25
  logger.info(message)
26
 
27
- def ocr_converter(input_file):
28
- image_pdf = input_file.name
29
- try:
30
- # Disable deskew, clean_final, and remove_background to avoid compatibility issues with --redo-ocr.
31
- ocrmypdf.ocr(
32
- image_pdf,
33
- image_pdf,
34
- redo_ocr=True,
35
- force_ocr=True,
36
- language="eng",
37
- output_type="pdf",
38
- deskew=False,
39
- clean_final=False,
40
- remove_background=False
41
- )
42
- update_log(f"OCR conversion successful for {image_pdf}")
43
- except Exception as e:
44
- error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
45
- update_log(error_msg)
46
- raise e
47
- return image_pdf
48
-
49
  def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
50
  try:
51
  if open_ai_key is not None:
52
  os.environ['OPENAI_API_KEY'] = open_ai_key
53
- pdf_doc = ocr_converter(pdf_doc)
54
- loader = OnlinePDFLoader(pdf_doc)
 
 
55
  pages = loader.load_and_split()
56
- update_log(f"Loaded {len(pages)} pages from {pdf_doc}")
57
 
58
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
59
  pages_to_be_loaded = []
@@ -138,13 +117,13 @@ def load_master_questionset_into_sqlite(connection):
138
  update_log(f"Total questions in DB: {total_questions}")
139
 
140
  def create_field_and_question_list_for_DOC_A():
141
- # Two sample entries
142
  fields = ["Loan Number", "Borrower"]
143
  queries = ["What is the Loan Number?", "Who is the Borrower?"]
144
  return fields, queries
145
 
146
  def create_field_and_question_list_for_DOC_B():
147
- # Two sample entries
148
  fields = ["Property Address", "Signed Date"]
149
  queries = ["What is the Property Address?", "What is the Signed Date?"]
150
  return fields, queries
@@ -294,11 +273,26 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
294
  log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
295
 
296
  with gr.Tab("OCR Converter"):
 
297
  with gr.Column():
298
- image_pdf = gr.File(label="Load PDF for OCR", file_types=['.pdf'], type='filepath')
299
  with gr.Row():
300
- ocr_pdf = gr.File(label="OCR'd PDF", file_types=['.pdf'], type='filepath', file_count="single")
301
- convert_btn = gr.Button("Convert")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  with gr.Tab("Upload Question Set"):
304
  with gr.Column():
@@ -320,7 +314,6 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
320
  load_fields_btn.click(retrieve_fields_and_questions, inputs=questionsets, outputs=fields_and_questions)
321
  answer_predefined_btn.click(answer_predefined_questions, inputs=questionsets, outputs=answers_df)
322
 
323
- convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
324
  load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
325
 
326
  demo.launch(debug=True)
 
3
  import time
4
  import pandas as pd
5
  import sqlite3
 
6
  import logging
7
 
8
+ from langchain.document_loaders import OnlinePDFLoader # for loading the PDF text
9
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain_community.vectorstores import Chroma # updated import for vectorization
12
  from langchain.chains import RetrievalQA # for QA chain
13
  from langchain_community.chat_models import ChatOpenAI # updated import for ChatOpenAI
14
+ from langchain_core.prompts import PromptTemplate # prompt template import
15
 
16
  # Setup basic logging
17
  logging.basicConfig(level=logging.INFO)
 
23
  log_messages += message + "\n"
24
  logger.info(message)
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
27
  try:
28
  if open_ai_key is not None:
29
  os.environ['OPENAI_API_KEY'] = open_ai_key
30
+
31
+ # Use the file path directly as OCR is removed; text is extracted via the document loader.
32
+ pdf_path = pdf_doc.name
33
+ loader = OnlinePDFLoader(pdf_path)
34
  pages = loader.load_and_split()
35
+ update_log(f"Extracted text from {len(pages)} pages in {pdf_path}")
36
 
37
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
38
  pages_to_be_loaded = []
 
117
  update_log(f"Total questions in DB: {total_questions}")
118
 
119
  def create_field_and_question_list_for_DOC_A():
120
+ # Two sample entries for DOC_A
121
  fields = ["Loan Number", "Borrower"]
122
  queries = ["What is the Loan Number?", "Who is the Borrower?"]
123
  return fields, queries
124
 
125
  def create_field_and_question_list_for_DOC_B():
126
+ # Two sample entries for DOC_B
127
  fields = ["Property Address", "Signed Date"]
128
  queries = ["What is the Property Address?", "What is the Signed Date?"]
129
  return fields, queries
 
273
  log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
274
 
275
  with gr.Tab("OCR Converter"):
276
+ # This tab is now repurposed (or can be removed)
277
  with gr.Column():
278
+ image_pdf = gr.File(label="Load PDF for Conversion", file_types=['.pdf'], type='filepath')
279
  with gr.Row():
280
+ extracted_text = gr.Textbox(label="Extracted Text", lines=10)
281
+ extract_btn = gr.Button("Extract Text")
282
+
283
+ # For demonstration, extract text using OnlinePDFLoader
284
+ def extract_text(pdf_file):
285
+ try:
286
+ loader = OnlinePDFLoader(pdf_file.name)
287
+ docs = loader.load_and_split()
288
+ text = "\n".join([doc.page_content for doc in docs])
289
+ update_log(f"Extracted text from {len(docs)} pages.")
290
+ return text
291
+ except Exception as e:
292
+ err = f"Error extracting text: {str(e)}"
293
+ update_log(err)
294
+ return err
295
+ extract_btn.click(extract_text, inputs=image_pdf, outputs=extracted_text)
296
 
297
  with gr.Tab("Upload Question Set"):
298
  with gr.Column():
 
314
  load_fields_btn.click(retrieve_fields_and_questions, inputs=questionsets, outputs=fields_and_questions)
315
  answer_predefined_btn.click(answer_predefined_questions, inputs=questionsets, outputs=answers_df)
316
 
 
317
  load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
318
 
319
  demo.launch(debug=True)