Spaces:

muradkhan
/

indemo

Paused

App Files Community

muradkhan committed on Jul 24, 2024

Commit

2f6398a

verified ·

1 Parent(s): 04afe69

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -93

app.py DELETED Viewed

@@ -1,93 +0,0 @@
-import PyPDF2
-from pprint import pprint
-from haystack import Pipeline
-from haystack.schema import Document
-from haystack.nodes import BM25Retriever
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
-from pdf2image import convert_from_path
-import pytesseract
-from PIL import Image
-import gradio as gr
-import os
-# Function to extract text from a PDF file using OCR
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    # Convert PDF pages to images
-    images = convert_from_path(pdf_path)
-    for image in images:
-        # Perform OCR on the image
-        text += pytesseract.image_to_string(image)
-    return text
-# Process and retrieve answers
-def process_invoice(pdf, hf_token, questions):
-    # Extract text from the PDF
-    extracted_text = extract_text_from_pdf(pdf.name)
-    document = Document(content=extracted_text)
-    docs = [document]
-    # Initializing the processor
-    processor = PreProcessor(
-        clean_empty_lines=True,
-        clean_whitespace=True,
-        clean_header_footer=True,
-        split_by="word",
-        split_length=500,
-        split_respect_sentence_boundary=True,
-        split_overlap=0,
-    )
-    preprocessed_docs = processor.process(docs)
-    document_store = InMemoryDocumentStore(use_bm25=True)
-    document_store.write_documents(preprocessed_docs)
-    retriever = BM25Retriever(document_store, top_k=2)
-    qa_template = PromptTemplate(prompt=
-        """ Using exclusively the information contained in the context, answer only the question asked without adding
-        suggestions for possible questions, and respond exclusively in English. If the answer cannot be deduced from the
-        context, Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice
-        respond: "Not sure because not relevant to the context.
-        Context: {join(documents)};
-        Question: {query}
-        """)
-    prompt_node = PromptNode(
-        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
-        api_key=hf_token,
-        default_prompt_template=qa_template,
-        max_length=500,
-        model_kwargs={"model_max_length": 5000}
-    )
-    rag_pipeline = Pipeline()
-    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
-    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
-    answers = {}
-    for question in questions.split(','):
-        result = rag_pipeline.run(query=question.strip())
-        answers[question] = result["results"][0].strip()
-    return answers
-# Gradio interface
-def gradio_interface(pdf, hf_token, questions):
-    answers = process_invoice(pdf, hf_token, questions)
-    return answers
-interface = gr.Interface(
-    fn=gradio_interface,
-    inputs=[
-        gr.inputs.File(file_count="single", type="file", label="Upload Invoice (PDF)"),
-        gr.inputs.Textbox(type="password", label="Enter your Hugging Face Token"),
-        gr.inputs.Textbox(lines=5, placeholder="Enter your questions separated by commas")
-    ],
-    outputs="json",
-    title="Invoice Data Extraction",
-    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions."
-)
-if __name__ == "__main__":
-    interface.launch()