Omarrran committed on
Commit
26cbdf6
·
verified ·
1 Parent(s): 21bef18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -351
app.py CHANGED
@@ -4,79 +4,102 @@ import time
4
  import pandas as pd
5
  import sqlite3
6
  import ocrmypdf
 
7
 
8
  from langchain.document_loaders import OnlinePDFLoader # for loading the pdf
9
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain.vectorstores import Chroma # for the vectorization part
12
- from langchain.chains import RetrievalQA # for conversing with chatGPT
13
  from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
14
  from langchain_core.prompts import PromptTemplate # updated import per warning
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
17
- # If an OpenAI API key is provided, it will be used for ChatOpenAI (GPT-4)
18
- if open_ai_key is not None:
19
- os.environ['OPENAI_API_KEY'] = open_ai_key
20
- # OCR Conversion - skips conversion of pages that already contain text
21
- pdf_doc = ocr_converter(pdf_doc)
22
- # Load the pdf file
23
- loader = OnlinePDFLoader(pdf_doc)
24
- pages = loader.load_and_split()
25
- print('pages loaded:', len(pages))
26
-
27
- # Create an instance of HuggingFaceEmbeddings (open source) for generating embeddings
28
- embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
29
 
30
- pages_to_be_loaded = []
31
-
32
- if relevant_pages:
33
- page_numbers = relevant_pages.split(",")
34
- if len(page_numbers) != 0:
35
  for page_number in page_numbers:
36
- if page_number.isdigit():
37
- pageIndex = int(page_number) - 1
38
- if pageIndex >= 0 and pageIndex < len(pages):
39
  pages_to_be_loaded.append(pages[pageIndex])
40
-
41
- # If no valid pages are specified, use the entire PDF.
42
- if len(pages_to_be_loaded) == 0:
43
- pages_to_be_loaded = pages.copy()
44
-
45
- # Create a vector store using the Chroma class with the open-source embeddings
46
- vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
47
 
48
- # Configure the prompt template for the QA chain
49
- prompt_template = (
50
- """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A.
51
- If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface.
52
- Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1.
53
- {context}
54
- Question: {question}
55
- Return the answer. Provide the answer in the JSON format and extract the key from the question. Where applicable, break the answer into bullet points.
56
- When the sentences are long, try and break them into sub sections and include all the information and do not skip any information.
57
- If there is an exception to the answer, please do include it in a 'Note:' section. If there are no exceptions to the answer, please skip the 'Note:' section.
58
- Include a 'For additional details refer to' section when the document has more information to offer on the topic being questioned.
59
- If the document has a Preface or 'Table of Contents' section, extract the chapter# and a short description and include the info under the 'For additional details refer to' section.
60
- List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different.
61
- If additional information is found in multiple pages within the same chapter, list the chapter only once.
62
- If chapter information cannot be extracted, include any other information that will help the user navigate to the relevant sections of the document.
63
- If the document does not contain a Preface or 'Table of Contents' section, please do not call that out. For example, do not include statements like
64
- the following in the answer - 'The document does not contain a Preface or 'Table of Contents' section'"""
65
- )
66
-
67
- PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
68
- chain_type_kwargs = {"prompt": PROMPT}
69
-
70
- global pdf_qa
71
- pdf_qa = RetrievalQA.from_chain_type(
72
- llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
73
- chain_type="stuff",
74
- retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
75
- chain_type_kwargs=chain_type_kwargs,
76
- return_source_documents=False
77
- )
78
-
79
- return "Ready"
 
 
 
 
 
 
 
 
 
80
 
81
  def create_db_connection():
82
  DB_FILE = "./questionset.db"
@@ -84,132 +107,127 @@ def create_db_connection():
84
  return connection
85
 
86
  def create_sqlite_table(connection):
87
- print("*****Entered the create_sqlite_table method*****")
88
  cursor = connection.cursor()
89
  try:
90
- data = f'SELECT * FROM questions'
91
- cursor.execute(data)
92
  cursor.fetchall()
93
  except sqlite3.OperationalError:
94
  cursor.execute(
95
- '''
96
- CREATE TABLE questions (document_type TEXT NOT NULL, questionset_tag TEXT NOT NULL, field TEXT NOT NULL, question TEXT NOT NULL)
97
- ''')
98
- print("*****questions table has been created******")
 
99
  connection.commit()
100
 
101
  def load_master_questionset_into_sqlite(connection):
102
  create_sqlite_table(connection)
103
  cursor = connection.cursor()
104
  masterlist_for_DOT_count = cursor.execute(
105
- "Select COUNT(document_type) from questions where document_type=? and questionset_tag=?",
106
  ("DOT", "masterlist",)
107
  ).fetchone()[0]
108
  if masterlist_for_DOT_count == 0:
109
- print("DOT masterlist has not yet been loaded, proceeding to load.")
110
  fieldListForDOT, queryListForDOT = create_field_and_question_list_for_DOT()
111
  fieldListForTransmittalSummary, queryListForTransmittalSummary = create_field_and_question_list_for_Transmittal_Summary()
112
- i = 0
113
- print("*****Entered the load master question set method*****")
114
- while i < len(queryListForDOT):
115
  cursor.execute(
116
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
117
  ["DOT", "masterlist", fieldListForDOT[i], queryListForDOT[i]]
118
  )
119
- i += 1
120
- i = 0
121
- while i < len(queryListForTransmittalSummary):
122
  cursor.execute(
123
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
124
  ["Transmittal Summary", "masterlist", fieldListForTransmittalSummary[i], queryListForTransmittalSummary[i]]
125
  )
126
- i += 1
127
  connection.commit()
128
- total_questions = cursor.execute("Select COUNT(document_type) from questions").fetchone()[0]
129
- print("*******Total number of questions in the DB:", total_questions)
130
 
131
  def create_field_and_question_list_for_DOT():
132
- query1 = "what is the Loan Number?"
133
- field1 = "Loan Number"
134
- query2 = "Who is the Borrower?"
135
- field2 = "Borrower"
136
- query3 = "what is the Case Number?"
137
- field3 = "Case Number"
138
- query4 = "what is the Mortgage Identification number?"
139
- field4 = "MIN Number"
140
- query5 = "DOT signed date?"
141
- field5 = "Signed Date"
142
- query6 = "Who is the Lender?"
143
- field6 = "Lender"
144
- query7 = "what is the VA/FHA Number?"
145
- field7 = "VA/FHA Number"
146
- query8 = "Who is the Co-Borrower?"
147
- field8 = "Co-Borrower"
148
- query9 = "What is the property type - single family, multi family?"
149
- field9 = "Property Type"
150
- query10 = "what is the Property Address?"
151
- field10 = "Property Address"
152
- query11 = "In what County is the property located?"
153
- field11 = "Property County"
154
- query12 = "what is the Electronically recorded date"
155
- field12 = "Electronic Recording Date"
156
- queryList = [query1, query2, query3, query4, query5, query6, query7, query8, query9, query10, query11, query12]
157
- fieldList = [field1, field2, field3, field4, field5, field6, field7, field8, field9, field10, field11, field12]
 
 
158
  return fieldList, queryList
159
 
160
  def create_field_and_question_list_for_Transmittal_Summary():
161
- query1 = "Who is the Borrower?"
162
- field1 = "Borrower"
163
- query2 = "what is the Property Address?"
164
- field2 = "Property Address"
165
- query3 = "what is the Loan Term?"
166
- field3 = "Loan Term"
167
- query4 = "What is the Base Income?"
168
- field4 = "Base Income"
169
- query5 = "what is the Borrower's SSN?"
170
- field5 = "Borrower's SSN"
171
- query6 = "Who is the Co-Borrower?"
172
- field6 = "Co-Borrower"
173
- query7 = "What is the Original Loan Amount?"
174
- field7 = "Original Loan Amount"
175
- query8 = "What is the Initial P&I payment?"
176
- field8 = "Initial P&I payment"
177
- query9 = "What is the Co-Borrower's SSN?"
178
- field9 = "Co-Borrower’s SSN"
179
- query10 = "Number of units?"
180
- field10 = "Units#"
181
- query11 = "Who is the Seller?"
182
- field11 = "Seller"
183
- query12 = "Document signed date?"
184
- field12 = "Signed Date"
185
- queryList = [query1, query2, query3, query4, query5, query6, query7, query8, query9, query10, query11, query12]
186
- fieldList = [field1, field2, field3, field4, field5, field6, field7, field8, field9, field10, field11, field12]
 
 
187
  return fieldList, queryList
188
 
189
  def retrieve_document_type_and_questionsettag_from_sqlite():
190
  connection = create_db_connection()
191
  load_master_questionset_into_sqlite(connection)
192
  cursor = connection.cursor()
193
- rows = cursor.execute("SELECT document_type, questionset_tag FROM questions order by document_type, upper(questionset_tag)").fetchall()
194
- print("Number of rows retrieved from DB:", len(rows))
195
  list_for_dropdown = []
196
  for i in rows:
197
- entries_in_row = list(i)
198
- concatenated_value = entries_in_row[0] + ":" + entries_in_row[1]
199
  if concatenated_value not in list_for_dropdown:
200
  list_for_dropdown.append(concatenated_value)
201
- print(concatenated_value)
202
- print("Number of unique entries found in the DB:", len(list_for_dropdown))
203
  connection.close()
204
  return gr.Dropdown.update(choices=list_for_dropdown, value=list_for_dropdown[0])
205
 
206
  def retrieve_fields_and_questions(dropdownoption):
207
- print("dropdownoption is:", dropdownoption)
208
  splitwords = dropdownoption.split(":")
209
  connection = create_db_connection()
210
  cursor = connection.cursor()
211
  fields_and_questions = cursor.execute(
212
- "SELECT document_type, field, question FROM questions where document_type=? and questionset_tag=?",
213
  (splitwords[0], splitwords[1],)
214
  ).fetchall()
215
  connection.close()
@@ -228,68 +246,76 @@ def add_questionset(data, document_type, tag_for_questionset):
228
  connection.close()
229
 
230
  def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_questionset):
231
- print('document type is:', document_type)
232
- print('tag_for_questionset is:', tag_for_questionset)
233
-
234
- if tag_for_questionset:
235
- if document_type:
236
- data = pd.read_csv(csv_file.name)
237
- add_questionset(data, document_type, tag_for_questionset)
238
- responseString = "Task Complete. Uploaded {} fields and the corresponding questions into the Database for {}:{}".format(data.shape[0], document_type, tag_for_questionset)
239
- return responseString
240
- else:
241
- return "Please select the Document Type and provide a name for the Question Set"
242
 
243
  def answer_predefined_questions(document_type_and_questionset):
244
- print('chosen document_type_and_questionset:', document_type_and_questionset)
245
- option_chosen = document_type_and_questionset.split(":")
246
- document_type = option_chosen[0]
247
- question_set = option_chosen[1]
248
- fields = []
249
- questions = []
250
- responses = []
251
  connection = create_db_connection()
252
  cursor = connection.cursor()
253
- if document_type is not None and question_set is not None:
254
- rows = cursor.execute(
255
- "SELECT field, question FROM questions where document_type=? and questionset_tag=?",
256
- (document_type, question_set,)
257
- ).fetchall()
258
- for i in rows:
259
- entries_in_row = list(i)
260
- fields.append(entries_in_row[0])
261
- questions.append(entries_in_row[1])
262
- responses.append(pdf_qa.run(entries_in_row[1]))
263
- else:
264
- return "Please choose your Document Type:QuestionSet"
 
 
 
265
  return pd.DataFrame({"Field": fields, "Question to gpt-4": questions, "Response from gpt-4": responses})
266
 
267
- def ocr_converter(input_file):
268
- image_pdf = input_file.name
269
- ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, language="eng")
270
- return image_pdf
271
-
272
  def summarize_contents():
273
  question = "Generate a short summary of the contents along with no more than 3 leading/example questions. Do not return the response in json format"
274
- return pdf_qa.run(question)
 
 
 
 
 
 
 
275
 
276
  def answer_query(query):
277
- question = query
278
- return pdf_qa.run(question)
279
-
 
 
 
 
 
 
 
 
 
 
280
  css = """
281
  #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
282
  """
283
 
284
  title = """
285
- <div style="text-align: center;max-width: 700px;">
286
  <h1>AskMoli - Chatbot for PDFs</h1>
287
- <p style="text-align: center;">Upload a .PDF, click the "Upload PDF and generate embeddings" button, <br />
288
- Wait for the Status to show Ready. You can choose to get answers to the pre-defined question set OR ask your own question <br />
289
- The app is built on GPT-4 and leverages the magic of PromptTemplate</p>
290
  </div>
291
  """
292
 
 
293
  with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
294
  with gr.Column(elem_id="col-container"):
295
  gr.HTML(title)
@@ -297,40 +323,42 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
297
  with gr.Tab("Chatbot"):
298
  with gr.Column():
299
  open_ai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
300
- pdf_doc = gr.File(label="Load a pdf", file_types=['.pdf'], type='filepath')
301
- relevant_pages = gr.Textbox(label="*Optional - List comma separated page numbers to load or leave this field blank to use the entire PDF")
302
 
303
  with gr.Row():
304
- status = gr.Textbox(label="Status", placeholder="", interactive=False)
305
- load_pdf = gr.Button("Upload PDF and generate embeddings")
306
 
307
  with gr.Row():
308
  summary = gr.Textbox(label="Summary")
309
- summarize_pdf = gr.Button("Have Moli Summarize the Contents")
310
 
311
  with gr.Row():
312
- input = gr.Textbox(label="Type in your question")
313
- output = gr.Textbox(label="Answer")
314
- submit_query = gr.Button("Submit your own question to AskMoli")
315
 
316
  with gr.Row():
317
- questionsets = gr.Dropdown(label="Pre-defined Question Sets stored in the DB", choices=[])
318
- load_questionsets = gr.Button("Retrieve Pre-defined Question Sets from DB")
319
- fields_and_questions = gr.Dataframe(label="Fields and Questions in the chosen Question Set")
320
- load_fields_and_questions = gr.Button("Retrieve Pre-defined Questions from the DB for the chosen QuestionSet")
321
 
322
  with gr.Row():
323
- answers = gr.Dataframe(label="Answers to Predefined Question set")
324
- answers_for_predefined_question_set = gr.Button("Get answers to the chosen pre-defined question set")
325
-
 
 
 
326
  with gr.Tab("OCR Converter"):
327
  with gr.Column():
328
- image_pdf = gr.File(label="Load the pdf to be converted", file_types=['.pdf'], type='filepath')
329
-
330
  with gr.Row():
331
- ocr_pdf = gr.File(label="OCR'd pdf", file_types=['.pdf'], type='filepath', file_count="single")
332
- convert_into_ocr = gr.Button("Convert")
333
-
334
  with gr.Tab("Upload Question Set"):
335
  with gr.Column():
336
  document_types = [
@@ -455,158 +483,31 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
455
  "Mortgage Earnest Money Promissory Note",
456
  "Mortgage Rental Agreement",
457
  "Mortgage Repayment Plan",
458
- "Mortgage Short Sale Miscellaneous",
459
- "Mortgage LM - Trial Offer Letter or Plan",
460
- "Mortgage Errors and Omissions Agreement",
461
- "Mortgage Custom Type 2",
462
- "Mortgage Custom Type 1",
463
- "Mortgage Loan Agreement",
464
- "Mortgage Loan Closing Information Summary",
465
- "Mortgage Loan Modification",
466
- "Mortgage Loan Summary Report",
467
- "Mortgage Lock Confirmation",
468
- "Mortgage Loss Drafts",
469
- "Mortgage Loss Mitigation",
470
- "Mortgage Lost Assignment Affidavit",
471
- "Mortgage Mech Lien",
472
- "Mortgage Mediation",
473
- "Mortgage MI Claim Explanation of Benefits",
474
- "Mortgage MI Policy Cancellation Document",
475
- "Mortgage MI Repurchase Document",
476
- "Mortgage Miscellaneous Lien Release",
477
- "Mortgage Mobile Home Documentation",
478
- "Mortgage Monthly Activity Report",
479
- "Mortgage Deed of Trust-Recorded",
480
- "Mortgage PMI Disclosure",
481
- "Mortgage Payments",
482
- "Mortgage Deed of Trust-Unrecorded",
483
- "Mortgage Motion For Relief",
484
- "Mortgage Note",
485
- "Mortgage Note Affidavit",
486
- "Mortgage Note Endorsements",
487
- "Mortgage Notice Of Appearance",
488
- "Mortgage Notice of Default Filedrecorded",
489
- "Mortgage Notice of Final Cure",
490
- "Mortgage Notice of Levy",
491
- "Mortgage Notice of Payment Change",
492
- "Mortgage Notice of Right to Cancel",
493
- "Mortgage Notice of Sale",
494
- "Mortgage Notice of Second Lien",
495
- "Mortgage Notice of Servicing Transfer-Transferee",
496
- "Mortgage Notice of Servicing Transfer-Transferor",
497
- "Mortgage Notice of Termination",
498
- "Mortgage Notice to Quit",
499
- "Mortgage Objection to Claim",
500
- "Mortgage Processing and Underwriting Doc Set",
501
- "Mortgage Objection to Motion for Relief",
502
- "Mortgage Affidavit of Occupancy",
503
- "Mortgage Occupancy Agreement",
504
- "Mortgage Occupancy Termination Agreement",
505
- "Mortgage Ombudsman Documents",
506
- "Mortgage Owner Affidavit",
507
- "Mortgage Ownership and Encumbrances Report",
508
- "Mortgage Pay History External",
509
- "Mortgage Paystub",
510
- "Mortgage Payoff Demand Statement",
511
- "Mortgage PMI Certificate",
512
- "Mortgage Post Petition Fee Notices",
513
- "Mortgage Post Sale Documents",
514
- "Mortgage Power of Attorney-Recorded",
515
- "Mortgage Power of Attorney-Unrecorded",
516
- "Mortgage Closing Instructions",
517
- "Mortgage Preliminary Modification",
518
- "Mortgage Merged-Privacy Policy Notice-Title Policy - Privacy Policy-1098 Privacy Policy",
519
- "Mortgage Probate Court Order",
520
- "Mortgage Proof of Claim",
521
- "Mortgage Property Legal and Vesting Report",
522
- "Mortgage Property Management Agreement",
523
- "Mortgage Property Notices",
524
- "Mortgage Public Assistance",
525
- "Mortgage Record Owner and Lien Certificate",
526
- "Mortgage Recorded Satisfaction",
527
- "Mortgage Regfore Affidavit Executed",
528
- "Mortgage Release of Lis Pendens",
529
- "Mortgage REO Bids",
530
- "Mortgage REO Other",
531
- "Mortgage Form 26-1820 Report and Certificate of Loan Disbursement",
532
- "Mortgage Request for Verification of Rent or Mortgage",
533
- "Mortgage Request for Waiver of R.E. Tax Escrow Requirements",
534
- "Mortgage 1003",
535
- "Mortgage RMA Package",
536
- "Mortgage Sale Postponement",
537
- "Mortgage Sale or Milestone Rescission",
538
- "Mortgage Satisfaction of Judgement Tax Mortgage Liens",
539
- "Mortgage Security Agreement",
540
- "Mortgage Separation Agreement",
541
- "Mortgage Servicing Acquisition",
542
- "Mortgage Servicing Disclosure Statement",
543
- "Mortgage Short Payoffs",
544
- "Mortgage Signature-Name Affidavit",
545
- "Mortgage Assumption of Mortgage",
546
- "Mortgage SCRA Related Documents",
547
- "Mortgage Social Security Card or Customer ID",
548
- "Mortgage Soft Delete",
549
- "Mortgage Flood Hazard Determination Form",
550
- "Mortgage Stipulated Agreement",
551
- "Mortgage Subordination Agreement",
552
- "Mortgage Subordination Request Form",
553
- "Mortgage Appointment of Substitute Trustee",
554
- "Mortgage Merged-Real Estate Taxes-Tax Bill-Tax Certificate",
555
- "Mortgage Tax Certificate",
556
- "Mortgage Tax Record Information Sheet",
557
- "Mortgage Tax Liens",
558
- "Mortgage Tax Search",
559
- "Mortgage Third Party Authorization",
560
- "Mortgage Title Commitment-Equity or Property Report",
561
- "Mortgage Title Policy",
562
- "Mortgage Title Policy Endorsement",
563
- "Mortgage Title Search",
564
- "Mortgage Title Insurance Other",
565
- "Mortgage Transfer of Claim",
566
- "Mortgage Uniform Underwriting and Transmittal Summary",
567
- "Mortgage Trustee Sale Guarantee",
568
- "Mortgage UCC-1 Financing Statement",
569
- "Mortgage Others",
570
- "Mortgage Unknown",
571
- "Mortgage Utility Bill",
572
- "Mortgage Valuation Orders",
573
- "Mortgage Verification Document Set",
574
- "Mortgage Verification of Service for Military Home Buyers",
575
- "Mortgage W2",
576
- "Mortgage W9",
577
- "Mortgage Wire Transfer Instructions",
578
- "Mortgage Workmens Compensation",
579
- "Mortgage Writ of Possession",
580
- "Mortgage Cover Page",
581
- "Mortgage Barcode Page",
582
- "Mortgage Wisconsin Tax Escrow Option Notice",
583
- "Mortgage Hazard Insurance Declaration",
584
- "Mortgage Flood Insurance Declaration",
585
- "Mortgage Quitclaim Deed",
586
- "Mortgage Tax Deed",
587
- "Mortgage Warranty Deed",
588
- "Mortgage ALTA Settlement Statement",
589
- "Mortgage Home Inspection Waiver",
590
- "Mortgage Insurance Disclosure"
591
  ]
592
- document_type_for_questionset = gr.Dropdown(choices=document_types, label="Select the Document Type")
593
- tag_for_questionset = gr.Textbox(label="Please provide a name for the question set. Ex: rwikd-dot-basic-questionset-20230707.")
594
- csv_file = gr.File(label="Load a csv - 2 columns with the headers as field, question", file_types=['.csv'], type='filepath')
595
 
596
  with gr.Row():
597
- status_for_loading_csv = gr.Textbox(label="Status", placeholder="", interactive=False)
598
- load_csv = gr.Button("Upload data into the database")
599
-
600
- load_pdf.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
601
- summarize_pdf.click(summarize_contents, outputs=summary)
602
- load_csv.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_loading_csv)
603
-
604
- load_questionsets.click(retrieve_document_type_and_questionsettag_from_sqlite, outputs=questionsets)
605
- load_fields_and_questions.click(retrieve_fields_and_questions, questionsets, fields_and_questions)
606
- answers_for_predefined_question_set.click(answer_predefined_questions, questionsets, answers)
607
-
608
- convert_into_ocr.click(ocr_converter, image_pdf, ocr_pdf)
609
- submit_query.click(answer_query, input, output)
 
 
 
 
 
610
 
611
- # Use this flavor of demo.launch if you need the app to have an admin page.
612
  demo.launch(debug=True)
 
4
  import pandas as pd
5
  import sqlite3
6
  import ocrmypdf
7
+ import logging
8
 
9
  from langchain.document_loaders import OnlinePDFLoader # for loading the pdf
10
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.vectorstores import Chroma # for the vectorization part
13
+ from langchain.chains import RetrievalQA # for conversing with ChatGPT
14
  from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
15
  from langchain_core.prompts import PromptTemplate # updated import per warning
16
 
17
# Logging configuration for the app: INFO and above, via the root logger setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global log collector: every message passed to update_log() is appended here,
# one message per line.
log_messages = ""


def update_log(message):
    """Record ``message`` in the global log collector and log it at INFO level.

    Args:
        message: Text to record. It is concatenated with a newline, so it must
            be a string.
    """
    global log_messages
    entry = message + "\n"
    log_messages = log_messages + entry
    logger.info(message)
26
+
27
def ocr_converter(input_file):
    """Run OCR over a PDF, overwriting it in place, and return its path.

    Args:
        input_file: The PDF to convert, given either as a filesystem path
            string or as an object that exposes the path through a ``name``
            attribute.

    Returns:
        The path of the PDF; the same path is used as OCR input and output.

    Raises:
        ValueError: If ``input_file`` is ``None`` (no file supplied).
        Exception: Any error raised by ``ocrmypdf.ocr`` is logged through
            ``update_log`` and then re-raised unchanged.
    """
    if input_file is None:
        raise ValueError("No PDF file was provided for OCR conversion.")

    # Accept both a plain path and a file wrapper that carries it in `.name`.
    image_pdf = getattr(input_file, "name", input_file)
    try:
        # Specify output_type="pdf" to bypass Ghostscript issues.
        ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, language="eng", output_type="pdf")
    except Exception as e:
        update_log(f"OCR conversion failed for {image_pdf}. Error: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise
    update_log(f"OCR conversion successful for {image_pdf}")
    return image_pdf
38
+
39
def _select_pages(pages, relevant_pages):
    """Return the entries of ``pages`` named by a comma-separated list of 1-based numbers.

    Tokens that are not plain digits, or that fall outside ``pages``, are ignored.
    """
    if not relevant_pages:
        return []
    selected = []
    for page_number in relevant_pages.split(","):
        page_number = page_number.strip()
        if not page_number.isdigit():
            continue
        page_index = int(page_number) - 1
        if 0 <= page_index < len(pages):
            selected.append(pages[page_index])
    return selected


def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
    """OCR and load a PDF, embed it into a vector store and build the global QA chain.

    On success the module-level ``pdf_qa`` holds a ``RetrievalQA`` chain backed
    by a Chroma store of the selected pages.

    Args:
        pdf_doc: The uploaded PDF, passed straight to ``ocr_converter``.
        open_ai_key: OpenAI API key. When non-empty it is exported as the
            ``OPENAI_API_KEY`` environment variable.
        relevant_pages: Optional comma-separated 1-based numbers selecting
            which loaded pages to embed. When empty, or when no number is
            valid, every loaded page is used.

    Returns:
        ``"Ready"`` on success, otherwise ``"Error: <message>"``. Failures are
        logged through ``update_log`` rather than raised.
    """
    global pdf_qa
    try:
        # A blank key (e.g. an empty textbox) must not overwrite a key that is
        # already configured in the environment.
        if open_ai_key:
            os.environ['OPENAI_API_KEY'] = open_ai_key
        # Perform OCR conversion; errors here will be logged.
        pdf_doc = ocr_converter(pdf_doc)
        # Load the PDF file
        loader = OnlinePDFLoader(pdf_doc)
        pages = loader.load_and_split()
        update_log(f"Loaded {len(pages)} pages from {pdf_doc}")

        # Use HuggingFaceEmbeddings (open source) for generating embeddings.
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

        pages_to_be_loaded = _select_pages(pages, relevant_pages)
        if not pages_to_be_loaded:
            pages_to_be_loaded = pages.copy()
            update_log("No specific pages selected; using entire PDF.")

        # Create a vector store using Chroma with the embeddings.
        vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)

        # Configure the prompt template for the QA chain. The continuation
        # lines are flush-left so the prompt text carries no leading indentation.
        prompt_template = (
            """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A.
If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface.
Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1.
{context}
Question: {question}
Return the answer. Provide the answer in the JSON format and extract the key from the question. Where applicable, break the answer into bullet points.
When the sentences are long, try and break them into sub sections and include all the information and do not skip any information.
If there is an exception to the answer, please do include it in a 'Note:' section. If there are no exceptions to the answer, please skip the 'Note:' section.
Include a 'For additional details refer to' section when the document has more information to offer on the topic being questioned.
If the document has a Preface or 'Table of Contents' section, extract the chapter# and a short description and include the info under the 'For additional details refer to' section.
List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different.
If additional information is found in multiple pages within the same chapter, list the chapter only once.
If chapter information cannot be extracted, include any other information that will help the user navigate to the relevant sections of the document.
If the document does not contain a Preface or 'Table of Contents' section, please do not call that out."""
        )

        PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
        chain_type_kwargs = {"prompt": PROMPT}

        pdf_qa = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
            chain_type="stuff",
            retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs=chain_type_kwargs,
            return_source_documents=False
        )
        update_log("PDF embeddings generated and QA chain initialized.")
        return "Ready"
    except Exception as e:
        # UI boundary: report the failure as a status string instead of raising.
        update_log(f"Error in load_pdf_and_generate_embeddings: {str(e)}")
        return f"Error: {str(e)}"
103
 
104
  def create_db_connection():
105
  DB_FILE = "./questionset.db"
 
107
  return connection
108
 
109
def create_sqlite_table(connection):
    """Ensure the ``questions`` table exists in the given SQLite database.

    Existence is checked in SQLite's schema catalog rather than by selecting
    every row, so the check stays cheap on a large table and unrelated
    database errors are not mistaken for a missing table.

    Args:
        connection: An open ``sqlite3`` connection. Pending changes are
            committed before returning.
    """
    update_log("Creating/Verifying SQLite table for questions.")
    cursor = connection.cursor()
    # Views share the table namespace, so an existing view also counts as present.
    already_present = cursor.execute(
        "SELECT 1 FROM sqlite_master WHERE type IN ('table', 'view') AND name=?",
        ("questions",),
    ).fetchone()
    if already_present is None:
        cursor.execute(
            '''
            CREATE TABLE questions (document_type TEXT NOT NULL, questionset_tag TEXT NOT NULL, field TEXT NOT NULL, question TEXT NOT NULL)
            '''
        )
        update_log("Questions table created.")
    connection.commit()
123
 
124
def load_master_questionset_into_sqlite(connection):
    """Seed the ``questions`` table with the built-in "masterlist" question sets.

    Each built-in document type ("DOT" and "Transmittal Summary") is checked
    on its own and inserted only when it has no "masterlist" rows yet, so one
    list being present neither blocks nor duplicates the other.

    Args:
        connection: An open ``sqlite3`` connection. The table is created first
            if needed, and inserts are committed before returning.
    """
    create_sqlite_table(connection)
    cursor = connection.cursor()
    masterlists = (
        ("DOT", create_field_and_question_list_for_DOT),
        ("Transmittal Summary", create_field_and_question_list_for_Transmittal_Summary),
    )
    for document_type, build_lists in masterlists:
        existing_count = cursor.execute(
            "SELECT COUNT(document_type) FROM questions WHERE document_type=? AND questionset_tag=?",
            (document_type, "masterlist",)
        ).fetchone()[0]
        if existing_count:
            continue
        update_log(f"Loading {document_type} masterlist into DB.")
        fields, questions = build_lists()
        cursor.executemany(
            "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
            [
                (document_type, "masterlist", field, question)
                for field, question in zip(fields, questions)
            ],
        )
    connection.commit()
    total_questions = cursor.execute("SELECT COUNT(document_type) FROM questions").fetchone()[0]
    update_log(f"Total questions in DB: {total_questions}")
148
 
149
def create_field_and_question_list_for_DOT():
    """Return the built-in DOT field names and their matching questions.

    Returns:
        A ``(fieldList, queryList)`` pair of equal-length lists, where
        ``queryList[i]`` is the question asked for ``fieldList[i]``.
    """
    # Each entry pairs a field name with the question asked for it.
    field_and_question = (
        ("Loan Number", "what is the Loan Number?"),
        ("Borrower", "Who is the Borrower?"),
        ("Case Number", "what is the Case Number?"),
        ("MIN Number", "what is the Mortgage Identification number?"),
        ("Signed Date", "DOT signed date?"),
        ("Lender", "Who is the Lender?"),
        ("VA/FHA Number", "what is the VA/FHA Number?"),
        ("Co-Borrower", "Who is the Co-Borrower?"),
        ("Property Type", "What is the property type - single family, multi family?"),
        ("Property Address", "what is the Property Address?"),
        ("Property County", "In what County is the property located?"),
        ("Electronic Recording Date", "what is the Electronically recorded date"),
    )
    fieldList = [field for field, _ in field_and_question]
    queryList = [question for _, question in field_and_question]
    return fieldList, queryList
179
 
180
def create_field_and_question_list_for_Transmittal_Summary():
    """Return the built-in Transmittal Summary field names and their questions.

    Returns:
        A ``(fieldList, queryList)`` pair of equal-length lists, where
        ``queryList[i]`` is the question asked for ``fieldList[i]``.
    """
    # Each entry pairs a field name with the question asked for it.
    field_and_question = (
        ("Borrower", "Who is the Borrower?"),
        ("Property Address", "what is the Property Address?"),
        ("Loan Term", "what is the Loan Term?"),
        ("Base Income", "What is the Base Income?"),
        ("Borrower's SSN", "what is the Borrower's SSN?"),
        ("Co-Borrower", "Who is the Co-Borrower?"),
        ("Original Loan Amount", "What is the Original Loan Amount?"),
        ("Initial P&I payment", "What is the Initial P&I payment?"),
        ("Co-Borrower’s SSN", "What is the Co-Borrower's SSN?"),
        ("Units#", "Number of units?"),
        ("Seller", "Who is the Seller?"),
        ("Signed Date", "Document signed date?"),
    )
    fieldList = [field for field, _ in field_and_question]
    queryList = [question for _, question in field_and_question]
    return fieldList, queryList
210
 
211
def retrieve_document_type_and_questionsettag_from_sqlite():
    """Build the dropdown choices of stored question sets.

    Opens the database, makes sure the built-in masterlists are loaded, and
    collects each distinct ``"<document_type>:<questionset_tag>"`` value in the
    order the query returns them.

    Returns:
        A ``gr.Dropdown.update`` carrying the choices, with the first choice
        preselected (or no selection when there are no choices).
    """
    connection = create_db_connection()
    try:
        load_master_questionset_into_sqlite(connection)
        cursor = connection.cursor()
        rows = cursor.execute("SELECT document_type, questionset_tag FROM questions ORDER BY document_type, UPPER(questionset_tag)").fetchall()
    finally:
        # Release the connection even when seeding or the query fails.
        connection.close()

    list_for_dropdown = []
    seen = set()  # constant-time duplicate check while keeping query order
    for document_type, questionset_tag in rows:
        concatenated_value = f"{document_type}:{questionset_tag}"
        if concatenated_value in seen:
            continue
        seen.add(concatenated_value)
        list_for_dropdown.append(concatenated_value)
        update_log(f"Found question set: {concatenated_value}")

    # Guard against an empty table instead of raising IndexError.
    first_choice = list_for_dropdown[0] if list_for_dropdown else None
    return gr.Dropdown.update(choices=list_for_dropdown, value=first_choice)
224
 
225
  def retrieve_fields_and_questions(dropdownoption):
 
226
  splitwords = dropdownoption.split(":")
227
  connection = create_db_connection()
228
  cursor = connection.cursor()
229
  fields_and_questions = cursor.execute(
230
+ "SELECT document_type, field, question FROM questions WHERE document_type=? AND questionset_tag=?",
231
  (splitwords[0], splitwords[1],)
232
  ).fetchall()
233
  connection.close()
 
246
  connection.close()
247
 
248
def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_questionset):
    """Read an uploaded CSV of fields/questions and store it as a question set.

    Args:
        csv_file: The uploaded CSV. Accepts either a plain path string (what
            a ``gr.File(type='filepath')`` component passes) or a file-like
            wrapper exposing the path via ``.name``. ``None`` means nothing
            was uploaded.
        document_type: Document type the question set belongs to.
        tag_for_questionset: Name under which the question set is stored.

    Returns:
        A human-readable status string: a completion message including the
        number of rows loaded, or a prompt describing the missing input.
    """
    if not (tag_for_questionset and document_type):
        return "Please select the Document Type and provide a name for the Question Set"
    if csv_file is None:
        return "Please upload a CSV file before storing the Question Set"

    # A str path has no ``.name`` attribute, so fall back to the value itself.
    csv_path = getattr(csv_file, "name", csv_file)
    data = pd.read_csv(csv_path)
    add_questionset(data, document_type, tag_for_questionset)
    responseString = f"Task Complete. Uploaded {data.shape[0]} fields and corresponding questions for {document_type}:{tag_for_questionset}"
    update_log(responseString)
    return responseString
 
 
 
257
 
258
def answer_predefined_questions(document_type_and_questionset):
    """Run every question of the chosen question set against the loaded PDF.

    Args:
        document_type_and_questionset: Dropdown value of the form
            ``"document_type:questionset_tag"``. ``None`` or an empty string
            (nothing selected) yields an empty result instead of an error.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"Field"``,
        ``"Question to gpt-4"`` and ``"Response from gpt-4"``, one row per
        stored question. When a question fails, its response cell holds the
        error text, which is also written to the log.
    """
    fields, questions, responses = [], [], []

    if document_type_and_questionset:
        # Split on the first ":" only so a tag that itself contains ":" is
        # kept whole instead of being truncated.
        document_type, _, question_set = document_type_and_questionset.partition(":")

        connection = create_db_connection()
        try:
            rows = connection.cursor().execute(
                "SELECT field, question FROM questions WHERE document_type=? AND questionset_tag=?",
                (document_type, question_set,)
            ).fetchall()
        finally:
            # Release the connection even if the query raises.
            connection.close()

        for field, question in rows:
            fields.append(field)
            questions.append(question)
            # Any failure (including pdf_qa not being set up yet) is recorded
            # per question so the remaining questions still get answered.
            try:
                responses.append(pdf_qa.run(question))
            except Exception as e:
                error_str = f"Error in pdf_qa.run for question '{question}': {str(e)}"
                update_log(error_str)
                responses.append(error_str)

    return pd.DataFrame({"Field": fields, "Question to gpt-4": questions, "Response from gpt-4": responses})
281
 
 
 
 
 
 
282
def summarize_contents():
    """Ask the module-level ``pdf_qa`` chain for a short summary of the PDF.

    Returns:
        The value returned by ``pdf_qa.run`` on success; otherwise an error
        string describing the failure. Both outcomes are written to the log.
    """
    prompt = "Generate a short summary of the contents along with no more than 3 leading/example questions. Do not return the response in json format"
    try:
        summary_text = pdf_qa.run(prompt)
        update_log("Summarization successful.")
    except Exception as exc:
        failure_message = f"Error in summarization: {exc!s}"
        update_log(failure_message)
        return failure_message
    return summary_text
292
 
293
def answer_query(query):
    """Answer a free-form user question via the module-level ``pdf_qa`` chain.

    Args:
        query: The question text passed to ``pdf_qa.run``.

    Returns:
        The value returned by ``pdf_qa.run`` on success; otherwise an error
        string describing the failure. Both outcomes are written to the log.
    """
    try:
        answer = pdf_qa.run(query)
        update_log(f"Query answered: {query}")
    except Exception as exc:
        failure_message = f"Error in answering query: {exc!s}"
        update_log(failure_message)
        return failure_message
    return answer
302
+
303
def get_log():
    """Return the module-level ``log_messages`` value for the log window."""
    current_log = log_messages
    return current_log
305
+
306
+ # Define CSS and title HTML
307
  css = """
308
  #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
309
  """
310
 
311
  title = """
312
+ <div style="text-align: center; max-width: 700px;">
313
  <h1>AskMoli - Chatbot for PDFs</h1>
314
+ <p>Upload a .PDF and click "Upload PDF and generate embeddings". Wait for the status to show "Ready". Then either choose a pre-defined question set or ask your own question. The app uses GPT-4 with a custom prompt template.</p>
 
 
315
  </div>
316
  """
317
 
318
+ # Build the Gradio interface
319
  with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
320
  with gr.Column(elem_id="col-container"):
321
  gr.HTML(title)
 
323
  with gr.Tab("Chatbot"):
324
  with gr.Column():
325
  open_ai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
326
+ pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')
327
+ relevant_pages = gr.Textbox(label="*Optional - Comma separated page numbers (leave blank for entire PDF)")
328
 
329
  with gr.Row():
330
+ status = gr.Textbox(label="Status", interactive=False)
331
+ load_pdf_btn = gr.Button("Upload PDF and generate embeddings")
332
 
333
  with gr.Row():
334
  summary = gr.Textbox(label="Summary")
335
+ summarize_pdf_btn = gr.Button("Summarize Contents")
336
 
337
  with gr.Row():
338
+ input_query = gr.Textbox(label="Type your question")
339
+ output_answer = gr.Textbox(label="Answer")
340
+ submit_query_btn = gr.Button("Submit your question")
341
 
342
  with gr.Row():
343
+ questionsets = gr.Dropdown(label="Pre-defined Question Sets", choices=[])
344
+ load_questionsets_btn = gr.Button("Retrieve Question Sets")
345
+ fields_and_questions = gr.Dataframe(label="Fields & Questions in the chosen set")
346
+ load_fields_btn = gr.Button("Retrieve Questions for chosen set")
347
 
348
  with gr.Row():
349
+ answers_df = gr.Dataframe(label="Answers to Pre-defined Question Set")
350
+ answer_predefined_btn = gr.Button("Get answers for chosen question set")
351
+
352
+ # Log window for error and info messages
353
+ log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
354
+
355
  with gr.Tab("OCR Converter"):
356
  with gr.Column():
357
+ image_pdf = gr.File(label="Load PDF for OCR", file_types=['.pdf'], type='filepath')
 
358
  with gr.Row():
359
+ ocr_pdf = gr.File(label="OCR'd PDF", file_types=['.pdf'], type='filepath', file_count="single")
360
+ convert_btn = gr.Button("Convert")
361
+
362
  with gr.Tab("Upload Question Set"):
363
  with gr.Column():
364
  document_types = [
 
483
  "Mortgage Earnest Money Promissory Note",
484
  "Mortgage Rental Agreement",
485
  "Mortgage Repayment Plan",
486
+ "Mortgage Short Sale Miscellaneous"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  ]
488
+ document_type_for_questionset = gr.Dropdown(choices=document_types, label="Select Document Type")
489
+ tag_for_questionset = gr.Textbox(label="Name for Question Set (e.g., rwikd-dot-basic-questionset-20230707)")
490
+ csv_file = gr.File(label="Load CSV (2 columns: field, question)", file_types=['.csv'], type='filepath')
491
 
492
  with gr.Row():
493
+ status_for_csv = gr.Textbox(label="Status", interactive=False)
494
+ load_csv_btn = gr.Button("Upload CSV into DB")
495
+
496
+ # Set up button actions
497
+ load_pdf_btn.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
498
+ summarize_pdf_btn.click(summarize_contents, outputs=summary)
499
+ submit_query_btn.click(answer_query, inputs=input_query, outputs=output_answer)
500
+
501
+ load_questionsets_btn.click(retrieve_document_type_and_questionsettag_from_sqlite, outputs=questionsets)
502
+ load_fields_btn.click(retrieve_fields_and_questions, inputs=questionsets, outputs=fields_and_questions)
503
+ answer_predefined_btn.click(answer_predefined_questions, inputs=questionsets, outputs=answers_df)
504
+
505
+ convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
506
+ load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
507
+
508
+ # Button to refresh the log window
509
+ refresh_log_btn = gr.Button("Refresh Log")
510
+ refresh_log_btn.click(get_log, outputs=log_window)
511
 
512
+ # Launch the Gradio app
513
  demo.launch(debug=True)