File size: 3,340 Bytes
641b252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import PyPDF2
from pprint import pprint
from haystack import Pipeline
from haystack.schema import Document
from haystack.nodes import BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import gradio as gr
import os

# Function to extract text from a PDF file using OCR
def extract_text_from_pdf(pdf_path):
    """Return the OCR-extracted text of every page of the PDF at *pdf_path*.

    Each page is rasterized with pdf2image (requires poppler) and run through
    Tesseract OCR; the per-page texts are concatenated in page order with no
    separator, matching pytesseract's raw output.
    """
    # Convert PDF pages to PIL images, one per page.
    images = convert_from_path(pdf_path)
    # "".join avoids the quadratic cost of repeated string concatenation
    # when the invoice has many pages.
    return "".join(pytesseract.image_to_string(image) for image in images)

# Process and retrieve answers
def process_invoice(pdf, hf_token, questions):
    """Answer questions about an invoice PDF via a BM25 + Mixtral RAG pipeline.

    Parameters
    ----------
    pdf : file-like or str
        Uploaded PDF — either a Gradio file object exposing ``.name`` (a
        temp-file path) or a plain filesystem path string.
    hf_token : str
        Hugging Face API token forwarded to the PromptNode.
    questions : str
        Comma-separated questions.

    Returns
    -------
    dict
        Maps each (stripped, non-empty) question to the model's answer string.
    """
    # Accept both a Gradio file object (has .name) and a plain path string,
    # so the function works with old and new Gradio File components alike.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    extracted_text = extract_text_from_pdf(pdf_path)
    document = Document(content=extracted_text)
    docs = [document]

    # Split the OCR text into 500-word passages for BM25 retrieval.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )

    preprocessed_docs = processor.process(docs)
    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    qa_template = PromptTemplate(prompt=
        """ Using exclusively the information contained in the context, answer only the question asked without adding
        suggestions for possible questions, and respond exclusively in English. If the answer cannot be deduced from the
        context, Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice
        respond: "Not sure because not relevant to the context.
        Context: {join(documents)};
        Question: {query}
        """)

    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000}
    )

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    # Strip each fragment once so the returned keys match the queries that
    # were actually run (the original kept the unstripped fragment as key),
    # and skip empties produced by trailing/double commas.
    for question in questions.split(','):
        question = question.strip()
        if not question:
            continue
        result = rag_pipeline.run(query=question)
        answers[question] = result["results"][0].strip()

    return answers

# Gradio interface
def gradio_interface(pdf, hf_token, questions):
    """Thin Gradio adapter: forward the UI inputs to process_invoice."""
    return process_invoice(pdf, hf_token, questions)

# Build the web UI. The gr.inputs.* namespace was deprecated in Gradio 3 and
# removed in Gradio 4, so the component classes are used directly; the removed
# File(type="file") kwarg is dropped (the handler accepts either a file object
# or a filepath string).
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()