# indemo / app.py — Hugging Face Spaces app (author: muradkhan, commit 641b252, 3.34 kB)
import PyPDF2
from pprint import pprint
from haystack import Pipeline
from haystack.schema import Document
from haystack.nodes import BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import gradio as gr
import os
def extract_text_from_pdf(pdf_path):
    """Render each page of the PDF at *pdf_path* to an image, OCR it with
    Tesseract, and return the concatenated text of all pages."""
    page_images = convert_from_path(pdf_path)
    # One OCR pass per rendered page, joined in page order.
    return "".join(pytesseract.image_to_string(img) for img in page_images)
# Process and retrieve answers
def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated questions about an invoice PDF with a RAG pipeline.

    Parameters
    ----------
    pdf : str or file-like
        Path to the invoice PDF, or an uploaded-file object exposing ``.name``
        (the legacy Gradio tempfile interface).
    hf_token : str
        Hugging Face API token forwarded to the PromptNode.
    questions : str
        Questions separated by commas; blank fragments are ignored.

    Returns
    -------
    dict
        Maps each stripped question to the model's first answer string.
    """
    # Accept either a plain path or an upload object with a .name attribute.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    extracted_text = extract_text_from_pdf(pdf_path)
    docs = [Document(content=extracted_text)]

    # Split the OCR text into ~500-word passages for BM25 retrieval.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    qa_template = PromptTemplate(prompt=
""" Using exclusively the information contained in the context, answer only the question asked without adding
suggestions for possible questions, and respond exclusively in English. If the answer cannot be deduced from the
context, Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice
respond: "Not sure because not relevant to the context.
Context: {join(documents)};
Question: {query}
""")
    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000}
    )

    # Retriever feeds the prompt node; the pipeline is built once and
    # reused for every question.
    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for question in questions.split(','):
        question = question.strip()
        if not question:
            # Skip empty fragments from trailing or doubled commas.
            continue
        result = rag_pipeline.run(query=question)
        # Key by the stripped question so keys match the queries actually run.
        answers[question] = result["results"][0].strip()
    return answers
# Gradio interface
def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: delegate straight to process_invoice and return its
    question -> answer mapping unchanged."""
    return process_invoice(pdf, hf_token, questions)
from types import SimpleNamespace


def _gradio_fn(pdf, hf_token, questions):
    """Adapter for the modern Gradio API: gr.File hands the callback a
    filepath string, while the pipeline code expects an object exposing
    ``.name`` (the legacy tempfile interface)."""
    path = pdf if isinstance(pdf, str) else pdf.name
    return gradio_interface(SimpleNamespace(name=path), hf_token, questions)


# NOTE(review): the original used the gr.inputs.* namespace, which was
# removed in Gradio 3.x — the script failed at import time on current
# Gradio. The top-level components below are the supported replacements.
interface = gr.Interface(
    fn=_gradio_fn,
    inputs=[
        gr.File(label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()