# indemo / app.py — Hugging Face Spaces app (author: muradkhan, commit 641b252, 3.34 kB)
import PyPDF2
from pprint import pprint
from haystack import Pipeline
from haystack.schema import Document
from haystack.nodes import BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import gradio as gr
import os
def extract_text_from_pdf(pdf_path):
    """Render each page of the PDF at *pdf_path* to an image, OCR it with
    Tesseract, and return the concatenated text of all pages."""
    page_images = convert_from_path(pdf_path)
    # One OCR pass per rendered page, joined in page order.
    return "".join(pytesseract.image_to_string(img) for img in page_images)
# Process and retrieve answers
def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated questions about an invoice PDF with a RAG pipeline.

    Parameters
    ----------
    pdf : str or file-like
        Path to the invoice PDF, or an uploaded-file object exposing ``.name``
        (the legacy Gradio tempfile interface).
    hf_token : str
        Hugging Face API token forwarded to the PromptNode.
    questions : str
        Questions separated by commas; blank fragments are ignored.

    Returns
    -------
    dict
        Maps each stripped question to the model's first answer string.
    """
    # Accept either a plain path or an upload object with a .name attribute.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    extracted_text = extract_text_from_pdf(pdf_path)
    docs = [Document(content=extracted_text)]

    # Split the OCR text into ~500-word passages for BM25 retrieval.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    qa_template = PromptTemplate(prompt=
""" Using exclusively the information contained in the context, answer only the question asked without adding
suggestions for possible questions, and respond exclusively in English. If the answer cannot be deduced from the
context, Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice
respond: "Not sure because not relevant to the context.
Context: {join(documents)};
Question: {query}
""")
    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000}
    )

    # Retriever feeds the prompt node; the pipeline is built once and
    # reused for every question.
    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for question in questions.split(','):
        question = question.strip()
        if not question:
            # Skip empty fragments from trailing or doubled commas.
            continue
        result = rag_pipeline.run(query=question)
        # Key by the stripped question so keys match the queries actually run.
        answers[question] = result["results"][0].strip()
    return answers
# Gradio interface
def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: delegate straight to process_invoice and return its
    question -> answer mapping unchanged."""
    return process_invoice(pdf, hf_token, questions)
from types import SimpleNamespace


def _gradio_fn(pdf, hf_token, questions):
    """Adapter for the modern Gradio API: gr.File hands the callback a
    filepath string, while the pipeline code expects an object exposing
    ``.name`` (the legacy tempfile interface)."""
    path = pdf if isinstance(pdf, str) else pdf.name
    return gradio_interface(SimpleNamespace(name=path), hf_token, questions)


# NOTE(review): the original used the gr.inputs.* namespace, which was
# removed in Gradio 3.x — the script failed at import time on current
# Gradio. The top-level components below are the supported replacements.
interface = gr.Interface(
    fn=_gradio_fn,
    inputs=[
        gr.File(label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()