muradkhan committed on
Commit
ddb0d1e
1 Parent(s): 2f6398a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from pprint import pprint
3
+ from haystack import Pipeline
4
+ from haystack.schema import Document
5
+ from haystack.nodes import BM25Retriever
6
+ from haystack.document_stores import InMemoryDocumentStore
7
+ from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
8
+ from pdf2image import convert_from_path
9
+ import pytesseract
10
+ from PIL import Image
11
+ import gradio as gr
12
+ import os
13
+
14
+ # Function to extract text from a PDF file using OCR
15
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page of the PDF at *pdf_path*.

    The PDF is rendered to one image per page with ``convert_from_path``;
    each image is passed to ``pytesseract.image_to_string`` and the
    per-page results are concatenated in page order with no separator.
    """
    page_images = convert_from_path(pdf_path)
    # Run OCR page by page and glue the results together.
    return "".join(pytesseract.image_to_string(page) for page in page_images)
23
+
24
+ # Process and retrieve answers
25
def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    The PDF is OCR'd, split into chunks of roughly 500 words, indexed in an
    in-memory BM25 store, and each question is answered by a retriever ->
    PromptNode pipeline backed by ``mistralai/Mixtral-8x7B-Instruct-v0.1``.

    Args:
        pdf: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or a plain path string.
        hf_token: Hugging Face API token passed to the ``PromptNode``.
        questions: Questions separated by commas. Surrounding whitespace is
            ignored and empty entries are skipped.

    Returns:
        A dict mapping each (stripped) question to the model's answer. An
        empty dict is returned when no non-empty question was supplied.

    Raises:
        ValueError: If no PDF was provided.
    """
    if pdf is None:
        raise ValueError("No PDF file was provided.")
    # Accept both a file wrapper (path in ``.name``) and a bare path string.
    pdf_path = getattr(pdf, "name", pdf)

    # Normalise the question list up front so blank entries (trailing or
    # doubled commas) never reach the LLM, and bail out before the
    # expensive OCR step when there is nothing to ask.
    query_list = [q.strip() for q in (questions or "").split(",")]
    query_list = [q for q in query_list if q]
    if not query_list:
        return {}

    # Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    docs = [Document(content=extracted_text)]

    # Clean the OCR output and split it into retrievable chunks.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    # The fallback instruction must be one sentence with a closed quote;
    # otherwise the model receives a broken instruction.
    qa_template = PromptTemplate(
        prompt=(
            "Using exclusively the information contained in the context, "
            "answer only the question asked without adding suggestions for "
            "possible questions, and respond exclusively in English. "
            "Don't add anything from the references if it is not asked "
            "explicitly. Do not repeat the same information twice. "
            "If the answer cannot be deduced from the context, respond: "
            '"Not sure because not relevant to the context."\n'
            "Context: {join(documents)};\n"
            "Question: {query}\n"
        )
    )

    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000},
    )

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for query in query_list:
        result = rag_pipeline.run(query=query)
        # Guard against an empty generation list instead of raising IndexError.
        generated = result.get("results") or []
        answers[query] = generated[0].strip() if generated else ""

    return answers
74
+
75
+ # Gradio interface
76
def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: forward the three UI inputs to ``process_invoice``.

    Returns whatever ``process_invoice`` returns, unchanged.
    """
    return process_invoice(pdf, hf_token, questions)
79
+
80
# Gradio UI: three inputs (invoice PDF, HF token, questions) feed
# ``gradio_interface``; its return value is rendered as JSON.
# NOTE(review): the ``gr.inputs`` namespace is reported as deprecated in newer
# Gradio releases -- confirm against the Gradio version this app is pinned to.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        # Single uploaded file.
        gr.inputs.File(
            file_count="single",
            type="file",
            label="Upload Invoice (PDF)",
        ),
        # Password-type textbox for the token.
        gr.inputs.Textbox(
            type="password",
            label="Enter your Hugging Face Token",
        ),
        # Free-form, multi-line list of comma-separated questions.
        gr.inputs.Textbox(
            lines=5,
            placeholder="Enter your questions separated by commas",
        ),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)


if __name__ == "__main__":
    interface.launch()