muradkhan committed on
Commit
2f6398a
1 Parent(s): 04afe69

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -93
app.py DELETED
@@ -1,93 +0,0 @@
1
- import PyPDF2
2
- from pprint import pprint
3
- from haystack import Pipeline
4
- from haystack.schema import Document
5
- from haystack.nodes import BM25Retriever
6
- from haystack.document_stores import InMemoryDocumentStore
7
- from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
8
- from pdf2image import convert_from_path
9
- import pytesseract
10
- from PIL import Image
11
- import gradio as gr
12
- import os
13
-
14
- # Function to extract text from a PDF file using OCR
15
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of the PDF located at ``pdf_path``.

    The PDF is rendered to images with ``convert_from_path`` and each image
    is passed through ``pytesseract.image_to_string``. The per-image strings
    are concatenated in the order the images are returned, with no separator
    added between them.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(page) for page in page_images)
23
-
24
- # Process and retrieve answers
25
def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated questions about an invoice PDF.

    The PDF is OCR'd to text, split into chunks, indexed in an in-memory BM25
    document store, and every question is run through a
    retriever -> PromptNode pipeline.

    Args:
        pdf: Uploaded file object exposing its path as ``.name``, or a plain
            path string.
        hf_token: Hugging Face token, passed to the PromptNode as ``api_key``.
        questions: Questions separated by commas. Whitespace around each
            question is ignored and empty entries are skipped.

    Returns:
        A dict mapping each stripped question to the answer string returned
        by the pipeline, or to ``""`` when the pipeline produced no result
        for that question. Returns an empty dict when no non-empty question
        was supplied.

    Raises:
        ValueError: If ``pdf`` is ``None``.
    """
    if pdf is None:
        raise ValueError("No PDF file was provided.")

    # Normalise the questions first so that blank input does not trigger the
    # (slow) OCR and model set-up for nothing.
    question_list = [
        part.strip() for part in (questions or "").split(",") if part.strip()
    ]
    if not question_list:
        return {}

    # Accept either an upload wrapper with a ``.name`` path or a path string.
    pdf_path = getattr(pdf, "name", pdf)
    extracted_text = extract_text_from_pdf(pdf_path)
    docs = [Document(content=extracted_text)]

    # Clean the OCR output and split it into chunks of up to 500 words,
    # keeping sentences intact and without overlap between chunks.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    # Only the two best-matching chunks are handed to the prompt as context.
    retriever = BM25Retriever(document_store, top_k=2)

    # The fallback instruction is kept as one complete sentence with a
    # balanced quote so the model receives an unambiguous rule.
    qa_template = PromptTemplate(
        prompt=(
            "Using exclusively the information contained in the context, "
            "answer only the question asked without adding\n"
            "suggestions for possible questions, and respond exclusively in "
            "English. Don't add anything from the references\n"
            "if it is not asked explicitly. Do not repeat the same "
            "information twice. If the answer cannot be deduced from the\n"
            'context, respond: "Not sure because not relevant to the '
            'context."\n'
            "Context: {join(documents)};\n"
            "Question: {query}\n"
        )
    )

    prompt_node = PromptNode(
        model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000},
    )

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(
        component=prompt_node, name="prompt_node", inputs=["retriever"]
    )

    answers = {}
    for question in question_list:
        result = rag_pipeline.run(query=question)
        # Guard against a missing or empty result list instead of raising
        # IndexError and losing the answers already collected.
        results = result.get("results") or []
        answers[question] = results[0].strip() if results else ""

    return answers
74
-
75
- # Gradio interface
76
def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: pass the form inputs straight to ``process_invoice``.

    Returns whatever ``process_invoice`` returns for the given PDF, token
    and comma-separated questions.
    """
    return process_invoice(pdf, hf_token, questions)
79
-
80
# Web UI definition: a PDF upload, a masked token field and a multi-line
# questions box feed ``gradio_interface``; its return value is shown as JSON.
# NOTE(review): ``gr.inputs.*`` looks like the legacy Gradio component
# namespace -- confirm it exists in the Gradio version this app is pinned to.
interface = gr.Interface(
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
    fn=gradio_interface,
    inputs=[
        gr.inputs.File(label="Upload Invoice (PDF)", file_count="single", type="file"),
        gr.inputs.Textbox(label="Enter your Hugging Face Token", type="password"),
        gr.inputs.Textbox(placeholder="Enter your questions separated by commas", lines=5),
    ],
    outputs="json",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()