"""Invoice Q&A: OCR a PDF invoice and answer questions about it with a RAG pipeline.

Flow: pdf2image + Tesseract OCR -> Haystack preprocessing -> BM25 retrieval ->
Mixtral-8x7B-Instruct via the Hugging Face Inference API -> Gradio web UI.
"""

import os  # noqa: F401 -- kept from original file; not used directly here

import PyPDF2  # noqa: F401 -- kept from original file; not used directly here
from pprint import pprint  # noqa: F401 -- kept from original file; not used directly here

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image  # noqa: F401 -- kept from original file; pdf2image returns PIL images

from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, PreProcessor, PromptNode, PromptTemplate
from haystack.schema import Document


def extract_text_from_pdf(pdf_path):
    """Return the OCR-extracted text of every page of the PDF at *pdf_path*.

    Pages are rasterized first so that scanned (image-only) invoices,
    which contain no embedded text layer, can still be read by Tesseract.
    """
    text = ""
    images = convert_from_path(pdf_path)
    for image in images:
        text += pytesseract.image_to_string(image)
    return text


def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated *questions* about the invoice PDF *pdf*.

    Args:
        pdf: Either a path string to the PDF or a file-like object with a
            ``.name`` attribute (as older Gradio file components provide).
        hf_token: Hugging Face API token used by the PromptNode.
        questions: Comma-separated question string.

    Returns:
        dict mapping each (stripped) question to the model's answer string.
    """
    # Accept both a plain path (Gradio >= 4 `type="filepath"`) and a file
    # object with `.name` (legacy Gradio file wrapper) for compatibility.
    pdf_path = getattr(pdf, "name", pdf)
    extracted_text = extract_text_from_pdf(pdf_path)

    docs = [Document(content=extracted_text)]

    # Split the OCR text into ~500-word chunks so retrieval stays focused.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    # NOTE: {join(documents)} and {query} are Haystack template variables and
    # must be kept verbatim.
    qa_template = PromptTemplate(
        prompt=""" Using exclusively the information contained in the context, answer only the question asked, without adding suggestions for possible questions, and respond exclusively in English. Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice. If the answer cannot be deduced from the context, respond: "Not sure because not relevant to the context."
Context: {join(documents)}; Question: {query} """
    )

    prompt_node = PromptNode(
        model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000},
    )

    # Query -> BM25 retriever -> LLM prompt node.
    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for question in questions.split(","):
        q = question.strip()
        if not q:
            continue  # skip empty entries produced by trailing/double commas
        result = rag_pipeline.run(query=q)
        # Key on the stripped question so callers get clean keys regardless
        # of whitespace around the commas in the input string.
        answers[q] = result["results"][0].strip()
    return answers


def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: delegate to :func:`process_invoice` and return its dict."""
    return process_invoice(pdf, hf_token, questions)


# Gradio >= 3 component API (the old `gr.inputs.*` namespace was removed in 4.x).
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", type="filepath", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()