ankur-bohra committed on
Commit
0d99179
1 Parent(s): 901322a

Add basic structure

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
37
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .conda
2
+ temp*
3
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM continuumio/miniconda3
2
+
3
+ WORKDIR /code
4
+
5
+ # Create the environment:
6
+ COPY ./environment.yml /code/environment.yml
7
+
8
+ RUN conda config --set channel_priority strict
9
+ RUN conda config --add channels conda-forge
10
+ RUN conda env create -f environment.yml
11
+
12
+ # Make RUN commands use the new environment:
13
+ SHELL ["conda", "run", "-n", "env", "/bin/bash", "-c"]
14
+
15
+ RUN pip install -r requirements.txt
16
+
17
+ # Demonstrate the environment is activated:
18
+ RUN echo "Making sure installation worked:"
19
+ RUN python -c "import gradio, pypdf, pdf2image, langchain, openai, datasets"
20
+
21
+ COPY . .
22
+
23
+ # The code to run when container is started:
24
+
25
+ ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "env", "python", "app.py"]
26
+
app.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+
6
+ from langchain.schema.output_parser import OutputParserException
7
+ import gradio as gr
8
+ from PIL import Image
9
+
10
+ import categories
11
+ from categories import Category
12
+ from main import process_image, process_pdf
13
+
14
# Hugging Face token used by the dataset savers below (configured externally,
# e.g. as a Space secret).
HF_TOKEN = os.getenv("HF_TOKEN")
# HTML template that embeds a base64-encoded PDF ({0}) in an iframe preview.
PDF_IFRAME = """
<div style="border-radius: 10px; width: 100%; overflow: hidden;">
<iframe
src="data:application/pdf;base64,{0}"
width="100%"
height="400"
type="application/pdf">
</iframe>
</div>"""

# Dataset savers: one for ordinary shared results, one for results the user
# explicitly flagged as incorrect/irrelevant.
hf_writer_normal = gr.HuggingFaceDatasetSaver(
    HF_TOKEN, "automatic-reimbursement-tool-demo", separate_dirs=False
)
hf_writer_incorrect = gr.HuggingFaceDatasetSaver(
    HF_TOKEN, "automatic-reimbursement-tool-demo-incorrect", separate_dirs=False
)
31
+ # with open("examples/example1.pdf", "rb") as pdf_file:
32
+ # base64_pdf = base64.b64encode(pdf_file.read())
33
+
34
+
35
+ # example_paths = []
36
+ # current_file_path = None
37
+
38
+ # def ignore_examples(function):
39
+ # def new_function(*args, **kwargs):
40
+ # global example_paths, current_file_path
41
+ # if current_file_path not in example_paths:
42
+ # return function(*args, **kwargs)
43
+
44
+
45
+
46
def display_file(input_file):
    """Preview the uploaded receipt: a base64 iframe for PDFs, an image otherwise."""
    global current_file_path
    current_file_path = input_file.name if input_file else None
    if not input_file:
        return gr.HTML.update(visible=False), gr.Image.update(visible=False)
    if input_file.name.endswith(".pdf"):
        # Embed the PDF as a base64 data URI inside the iframe template.
        with open(input_file.name, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode()
        return (
            gr.HTML.update(PDF_IFRAME.format(encoded_pdf), visible=True),
            gr.Image.update(visible=False),
        )
    # Any non-PDF upload is shown through the image component.
    return (
        gr.HTML.update(visible=False),
        gr.Image.update(input_file.name, visible=True),
    )
62
+
63
+
64
def show_intermediate_outputs(show_intermediate):
    """Toggle the intermediate-outputs accordion to match the checkbox state."""
    return gr.Accordion.update(visible=bool(show_intermediate))
69
+
70
+
71
def show_share_contact(share_result):
    """Show the contact textbox only when result sharing is enabled."""
    contact_visible = share_result
    return gr.Textbox.update(visible=contact_visible)
73
+
74
+
75
def clear_inputs():
    """Reset the file-upload component to empty."""
    return gr.File.update(value=None)
77
+
78
+
79
def submit(input_file, old_text):
    """Extract raw text from the uploaded receipt.

    Args:
        input_file: The uploaded gradio file object (or None).
        old_text: Current contents of the extracted-text box; unused, kept for
            interface compatibility with the event wiring.

    Returns:
        The text extracted from the PDF or image.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not input_file:
        # BUG FIX: gr.Error must be *raised*, not merely constructed, for
        # gradio to display the error message to the user.
        raise gr.Error("Please upload a file to continue!")

    # Dispatch on file type; extract_only stops before the LLM stages.
    if input_file.name.endswith(".pdf"):
        text = process_pdf(Path(input_file.name), extract_only=True)
    else:
        text = process_image(Path(input_file.name), extract_only=True)
    return text
90
+
91
+
92
def categorize_extracted_text(extracted_text):
    """Classify the extracted receipt text into one of the known categories."""
    return categories.categorize_text(extracted_text)
96
+
97
+
98
def extract_from_category(category, extracted_text):
    """Run the category-specific extraction chain and build the output updates.

    Args:
        category: Name of a Category member, or falsy to reset the outputs.
        extracted_text: OCR text of the receipt.

    Returns:
        Updates for (chatbot transcript, information JSON, flag-incorrect
        button, flag-irrelevant button).
    """
    if not category:
        # No category selected: clear outputs and disable the flag buttons.
        return (
            gr.Chatbot.update(None),
            gr.JSON.update(None),
            gr.Button.update(interactive=False),
            gr.Button.update(interactive=False),
        )
    category = Category[category]
    chain = categories.category_modules[category].chain
    formatted_prompt = chain.prompt.format_prompt(
        text=extracted_text,
        format_instructions=chain.output_parser.get_format_instructions(),
    )
    result = chain.generate(
        input_list=[
            {
                "text": extracted_text,
                "format_instructions": chain.output_parser.get_format_instructions(),
            }
        ]
    )
    # Render the prompt transcript for display. Prompts are built as
    # [system, human] (or [human] alone), so the system message — when
    # present — is messages[0] and the human message is last.
    # BUG FIX: the labels were swapped (messages[1] shown as System and
    # messages[0] as Human).
    question = ""
    if len(formatted_prompt.messages) > 1:
        question += f"**System:**\n{formatted_prompt.messages[0].content}"
    question += f"\n\n**Human:**\n{formatted_prompt.messages[-1].content}"
    answer = result.generations[0][0].text
    try:
        information = chain.output_parser.parse_with_prompt(answer, formatted_prompt)
        information = information.json() if information else {}
    except OutputParserException as e:
        # Surface parse failures in the JSON panel instead of crashing the app.
        information = {
            "error": "Unable to parse chatbot output",
            "details": str(e),
            "output": e.llm_output,
        }
    return (
        gr.Chatbot.update([[question, answer]]),
        gr.JSON.update(information),
        gr.Button.update(interactive=True),
        gr.Button.update(interactive=True),
    )
141
+
142
+
143
def dynamic_auto_flag(flag_method):
    """Wrap a flagging callback so it fires only when sharing is enabled.

    The wrapper consumes the share checkbox value as its first argument and
    forwards the remaining arguments to ``flag_method`` only when it is truthy.
    """

    def modified_flag_method(share_result, *args, **kwargs):
        if not share_result:
            return
        flag_method(*args, **kwargs)

    return modified_flag_method
149
+
150
+
151
+ # def save_example_and_submit(input_file):
152
+ # example_paths.append(input_file.name)
153
+ # submit(input_file, "")
154
+
155
+
156
# Top-level UI definition. Component construction order and the event wiring
# below are order-dependent; comments only, no code changes.
with gr.Blocks(title="Automatic Reimbursement Tool Demo") as page:
    gr.Markdown("<center><h1>Automatic Reimbursement Tool Demo</h1></center>")
    gr.Markdown("<h2>Description</h2>")
    gr.Markdown(
        "The reimbursement filing process can be time-consuming and cumbersome, causing "
        "frustration for faculty members and finance departments. Our project aims to "
        "automate the information extraction involved in the process by feeding "
        "extracted text to language models such as ChatGPT. This demo showcases the "
        "categorization and extraction parts of the pipeline. Categorization is done "
        "to identify the relevant details associated with the text, after which "
        "extraction is done for those details using a language model."
    )
    gr.Markdown("<h2>Try it out!</h2>")
    with gr.Box() as demo:
        with gr.Row():
            # Left column: receipt upload, previews, and sharing options.
            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Input</center></div>'
                )
                pdf_preview = gr.HTML(label="Preview", show_label=True, visible=False)
                image_preview = gr.Image(
                    label="Preview", show_label=True, visible=False, height=350
                )
                input_file = gr.File(
                    label="Input receipt",
                    show_label=True,
                    type="file",
                    file_count="single",
                    file_types=["image", ".pdf"],
                )
                # Refresh the PDF/image preview whenever the upload changes.
                input_file.change(
                    display_file, input_file, [pdf_preview, image_preview]
                )

                with gr.Row():
                    clear = gr.Button("Clear", variant="secondary")
                    submit_button = gr.Button("Submit", variant="primary")

                show_intermediate = gr.Checkbox(
                    False,
                    label="Show intermediate outputs",
                    info="There are several intermediate steps in the process such as preprocessing, OCR, chatbot interaction. You can choose to show their results here.",
                )
                share_result = gr.Checkbox(
                    True,
                    label="Share results",
                    info="Sharing your result with us will help us immensely in improving this tool.",
                    interactive=True,
                )
                contact = gr.Textbox(
                    type="email",
                    label="Contact",
                    interactive=True,
                    placeholder="Enter your email address",
                    info="Optionally, enter your email address to allow us to contact you regarding your result.",
                    visible=True,
                )
                # The contact box is only relevant when results are shared.
                share_result.change(show_share_contact, share_result, [contact])

            # Right column: recognized category, intermediate outputs, and
            # the final extracted information with flagging buttons.
            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Output</center></div>'
                )
                category = gr.Dropdown(
                    value=None,
                    choices=Category.__members__.keys(),
                    label=f"Recognized category ({', '.join(Category.__members__.keys())})",
                    show_label=True,
                    interactive=False,
                )
                intermediate_outputs = gr.Accordion(
                    "Intermediate outputs", open=True, visible=False
                )
                with intermediate_outputs:
                    extracted_text = gr.Textbox(
                        label="Extracted text",
                        show_label=True,
                        max_lines=5,
                        show_copy_button=True,
                        lines=5,
                        interactive=False,
                    )
                    chatbot = gr.Chatbot(
                        None,
                        label="Chatbot interaction",
                        show_label=True,
                        interactive=False,
                        height=240,
                    )
                information = gr.JSON(label="Extracted information")
                with gr.Row():
                    flag_incorrect_button = gr.Button(
                        "Flag as incorrect", variant="stop", interactive=True
                    )
                    flag_irrelevant_button = gr.Button(
                        "Flag as irrelevant", variant="stop", interactive=True
                    )

    show_intermediate.change(
        show_intermediate_outputs, show_intermediate, [intermediate_outputs]
    )

    clear.click(clear_inputs, None, [input_file])
    # Submitting triggers two handlers: text extraction, and a reset of the
    # downstream outputs (category/chatbot/JSON) when a file is present.
    submit_button.click(
        submit,
        [input_file, extracted_text],
        [extracted_text],
    )
    submit_button.click(
        lambda input_file, category, chatbot, information: (
            gr.Dropdown.update(None),
            gr.Chatbot.update(None),
            gr.Textbox.update(None),
        ) if input_file else (category, chatbot, information),
        [input_file, category, chatbot, information],
        [category, chatbot, information],
    )
    # Pipeline chaining: new extracted text -> categorize; new category ->
    # run the category-specific extraction chain.
    extracted_text.change(
        categorize_extracted_text,
        [extracted_text],
        [category],
    )
    category.change(
        extract_from_category,
        [category, extracted_text],
        [chatbot, information, flag_incorrect_button, flag_irrelevant_button],
    )

    # Auto-share every completed result (gated on the share checkbox by
    # dynamic_auto_flag) to the "normal" dataset.
    hf_writer_normal.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged",
    )
    flag_method = gr.flagging.FlagMethod(
        hf_writer_normal, "", "", visual_feedback=True
    )
    information.change(
        dynamic_auto_flag(flag_method),
        inputs=[
            share_result,
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=None,
        preprocess=False,
        queue=False,
    )

    # Manual flagging: both buttons write to the "incorrect" dataset with
    # different labels, showing a "Saving..." state while the write runs.
    hf_writer_incorrect.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged_incorrect",
    )
    flag_incorrect_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as incorrect",
        "Incorrect",
        visual_feedback=True,
    )
    flag_incorrect_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_incorrect_button,
        queue=False,
    )
    flag_incorrect_button.click(
        flag_incorrect_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_incorrect_button],
        preprocess=False,
        queue=False,
    )

    flag_irrelevant_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as irrelevant",
        "Irrelevant",
        visual_feedback=True,
    )
    flag_irrelevant_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_irrelevant_button,
        queue=False,
    )
    flag_irrelevant_button.click(
        flag_irrelevant_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_irrelevant_button],
        preprocess=False,
        queue=False,
    )


page.launch(show_api=True, show_error=True, debug=True)
categories/__init__.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import Union
3
+
4
+ # from . import vendor
5
+ from langchain.chains import LLMChain
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.output_parsers import PydanticOutputParser
8
+ from langchain.output_parsers.enum import EnumOutputParser
9
+ from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate,
10
+ SystemMessagePromptTemplate)
11
+ from pydantic import BaseModel
12
+
13
+ from . import accomodation, random_, travel_cab, travel_flight
14
+
15
+
16
class Category(Enum):
    """Receipt categories the classifier can assign.

    Each member maps to an extraction module via ``category_modules`` below.
    NOTE(review): "ACCOMODATION" misspells "accommodation"; kept as-is because
    the value is part of the prompt/parse contract.
    """

    ACCOMODATION = "ACCOMODATION"
    TRAVEL_FLIGHT = "TRAVEL_FLIGHT"
    TRAVEL_CAB = "TRAVEL_CAB"
    # VENDOR = "VENDOR"
    RANDOM = "RANDOM"
22
+
23
+
24
# Maps each Category member to the module implementing its extraction chain.
category_modules = {
    Category.ACCOMODATION: accomodation,
    Category.TRAVEL_FLIGHT: travel_flight,
    Category.TRAVEL_CAB: travel_cab,
    # Category.VENDOR: vendor,
    Category.RANDOM: random_,
}

# Deterministic (temperature=0) chat model used only for categorization.
model = ChatOpenAI(
    temperature=0,
    n=1,
    # max_tokens=300,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build categorizing chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are a classifier that, given a bill's text, states what type of bill "
    "category it belongs to: accomodation (bills regarding stays), travel (bills "
    "concerning cab or other land rides), travel (bills concerning flights), random "
    "(bills concerning deliveries from e-commerce websites like amazon etc) bills.\n"
    "You may want to see if there are Room Details, Check-in/Check-out Date for "
    "Accomodation stay; Flight Details, Train Details, Bus Details Cab details for "
    "Travel; Conference Details for Conference organizers; anything else comes under "
    "random category. Your answers must be only the appropriate choice e.g. 'option' and "
    "not 'The given bill belongs to the option category.'\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Parses the model's raw reply directly into a Category member.
category_parser = EnumOutputParser(enum=Category)
categorize_chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=category_parser
)
65
+
66
+
67
def categorize_text(text: str) -> Category:
    """Categorize the text into one of the categories defined in ``Category``
    by querying ChatGPT.

    The enum output parser attached to ``categorize_chain`` converts the
    model's reply into a ``Category`` member.

    Args:
        text (str): The text to categorize.

    Returns:
        The category of the text.
    """
    return categorize_chain.run(
        text=text, format_instructions=category_parser.get_format_instructions()
    )
79
+
80
+
81
def run_category_chain(category: Category, text: str) -> Union[BaseModel, None]:
    """Runs the chain for the given category on the given text.

    Args:
        category (Category): The category for which the chain is to be run.
        text (str): The text on which the chain is to be run.

    Returns:
        The parsed output of the chain, or None if the chain raised.
    """
    output_parser = category_modules[category].output_parser
    try:
        return category_modules[category].chain.run(
            text=text, format_instructions=output_parser.get_format_instructions()
        )
    except Exception as e:
        # Deliberate best-effort: LLM/parsing failures are printed and
        # swallowed (returning None) so one bad receipt can't crash callers.
        print("Error in running chain for category", category, ":", e)
97
+
98
+
99
+ if __name__ == "__main__":
100
+ text = """amazonin
101
+ we)
102
+
103
+ Sold By :
104
+
105
+ Spigen India Pvt. Ltd.
106
+
107
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
108
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
109
+ - Binola, National Highway -8, Tehsil - Manesar
110
+ Gurgaon, Haryana, 122413
111
+
112
+ IN
113
+
114
+ PAN No: ABACS5056L
115
+ GST Registration No: O6ABACS5056L12Z5
116
+
117
+ Order Number: 407-5335982-7837125
118
+ Order Date: 30.05.2023
119
+
120
+ Tax Invoice/Bill of Supply/Cash Memo
121
+ (Original for Recipient)
122
+
123
+ Billing Address :
124
+
125
+ Praveen Bohra
126
+
127
+ E-303, ParkView City 2, Sector 49, Sohna Road
128
+ GURGAON, HARYANA, 122018
129
+
130
+ IN
131
+
132
+ State/UT Code: 06
133
+
134
+ Shipping Address :
135
+
136
+ Praveen Bohra
137
+
138
+ Praveen Bohra
139
+
140
+ E-303, ParkView City 2, Sector 49, Sohna Road
141
+ GURGAON, HARYANA, 122018
142
+
143
+ IN
144
+
145
+ State/UT Code: 06
146
+
147
+ Place of supply: HARYANA
148
+
149
+ Place of delivery: HARYANA
150
+
151
+ Invoice Number : DEL5-21033
152
+ Invoice Details : HR-DEL5-918080915-2324
153
+ Invoice Date : 30.05.2023
154
+
155
+ Description at Tax |Tax /|Tax Total
156
+ p y Rate |Type |Amount|Amount
157
+
158
+ Black) | BO8BHLZHBH ( ACS01744INP )
159
+ HSN:39269099
160
+
161
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
162
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
163
+ 9% |SGST| %76.19
164
+
165
+ TOTAL:
166
+
167
+ Amount in Words:
168
+ Nine Hundred Ninety-nine only
169
+
170
+ Whether tax is payable under reverse charge - No
171
+
172
+ For Spigen India Pvt. Ltd.:
173
+ sSoigenrn
174
+
175
+ Authorized Signatory
176
+
177
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
178
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
179
+
180
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
181
+
182
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
183
+
184
+ Please note that this invoice is not a demand for payment
185
+
186
+ Page 1 of 1"""
187
+ category = categorize_text(text)
188
+ print("Category:", category)
189
+
190
+ print("\n\n")
191
+ result = run_category_chain(category, text)
192
+ print(result)
categories/accomodation/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for accommodation extraction.
# NOTE(review): temperature is 0.6 here while sibling category modules use 0 —
# confirm the nondeterminism is intended.
model = ChatOpenAI(
    temperature=0.6,
    max_tokens=300,
    n=1,
    request_timeout=None,
    model_kwargs={
        'stop': None,
        'top_p': 1,
    }
)

# Build category chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
    "format given as text. The system should extract important information necessary for "
    "the reimbursement process from a college. Your prompt should fetch the following "
    "essential details from the hotel bill: hotel name, address, bill number/invoice "
    "number, booking ID / confirmation ID / booking number, check-in date and time, "
    "check-out date and time, total amount, booking platform, bill date.\n"
    "Ensure that the system accurately extracts the above information from the OCR text "
    "of the hotel bill.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Pydantic parser wrapped in an OutputFixingParser so malformed LLM output is
# sent back to the model for correction instead of failing outright.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/accomodation/model.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class InformationExtractedFromABillReceipt(BaseModel):
    """Schema for details extracted from a hotel/accommodation bill.

    1. Hotel Name: [Hotel Name]
    2. Address: [Hotel Address]
    3. Bill number/Invoice number: [Bill Number]
    4. booking ID / Confirmation ID / Booking #: [Booking ID]
    5. Check-in Date and Time: [Check-in Date Time]
    6. Check-out Date and Time: [Check-out Date Time]
    7. Total Amount: [Total Amount Charged]
    8. Booking platform: [Booking Platform]
    9. Bill date: [Bill Date]
    """

    # NOTE(review): "hostel_name" looks like a typo for "hotel_name" (its title
    # says hotel). Renaming would change the JSON schema sent to the LLM and
    # the parsed output keys, so it is left unchanged here.
    hostel_name: str = Field(..., title="The name of the hotel")
    address: str = Field(..., title="The address of the hotel")
    bill_number: str = Field(..., title="The bill number/invoice number")
    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
    check_in_date_time: datetime = Field(..., title="The check-in date and time")
    check_out_date_time: datetime = Field(..., title="The check-out date and time")
    total_amount_charged: float = Field(..., title="The total amount charged")
    booking_platform: str = Field(..., title="The booking platform")
    bill_date: datetime = Field(..., title="The bill date")
categories/random_/__init__.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Deterministic (temperature=0) chat model for generic receipt extraction.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build category chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
    "may use context to make an educated guess about the currency. Use null if you are "
    "unable to find certain details\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Wrap the pydantic parser so malformed LLM output is sent back for fixing.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
38
+
39
+ if __name__ == "__main__":
40
+ text = """amazonin
41
+ we)
42
+
43
+ Sold By :
44
+
45
+ Spigen India Pvt. Ltd.
46
+
47
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
48
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
49
+ - Binola, National Highway -8, Tehsil - Manesar
50
+ Gurgaon, Haryana, 122413
51
+
52
+ IN
53
+
54
+ PAN No: ABACS5056L
55
+ GST Registration No: O6ABACS5056L12Z5
56
+
57
+ Order Number: 407-5335982-7837125
58
+ Order Date: 30.05.2023
59
+
60
+ Tax Invoice/Bill of Supply/Cash Memo
61
+ (Original for Recipient)
62
+
63
+ Billing Address :
64
+
65
+ Praveen Bohra
66
+
67
+ E-303, ParkView City 2, Sector 49, Sohna Road
68
+ GURGAON, HARYANA, 122018
69
+
70
+ IN
71
+
72
+ State/UT Code: 06
73
+
74
+ Shipping Address :
75
+
76
+ Praveen Bohra
77
+
78
+ Praveen Bohra
79
+
80
+ E-303, ParkView City 2, Sector 49, Sohna Road
81
+ GURGAON, HARYANA, 122018
82
+
83
+ IN
84
+
85
+ State/UT Code: 06
86
+
87
+ Place of supply: HARYANA
88
+
89
+ Place of delivery: HARYANA
90
+
91
+ Invoice Number : DEL5-21033
92
+ Invoice Details : HR-DEL5-918080915-2324
93
+ Invoice Date : 30.05.2023
94
+
95
+ Description at Tax |Tax /|Tax Total
96
+ p y Rate |Type |Amount|Amount
97
+
98
+ Black) | BO8BHLZHBH ( ACS01744INP )
99
+ HSN:39269099
100
+
101
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
102
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
103
+ 9% |SGST| %76.19
104
+
105
+ TOTAL:
106
+
107
+ Amount in Words:
108
+ Nine Hundred Ninety-nine only
109
+
110
+ Whether tax is payable under reverse charge - No
111
+
112
+ For Spigen India Pvt. Ltd.:
113
+ sSoigenrn
114
+
115
+ Authorized Signatory
116
+
117
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
118
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
119
+
120
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
121
+
122
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
123
+
124
+ Please note that this invoice is not a demand for payment
125
+
126
+ Page 1 of 1"""
127
+ # result = chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions())
128
+ # print(result.json(indent=4))
129
+ result = chain.generate(input_list=[{"text": text, "format_instructions": fixing_parser.get_format_instructions()}])
130
+ print(result)
131
+ result = fixing_parser.parse_with_prompt(result.generations[0][0].text, chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions()))
132
+ print(result)
133
+ # result = chain.run(text=text, format_instructions=output_parser.get_format_instructions(), verbose=True)
134
+ # print(result)
categories/random_/model.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated by datamodel-codegen:
2
+ # filename: schema.json
3
+ # timestamp: 2023-07-28T11:36:16+00:00
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import date
8
+ from typing import Dict, Optional, Union
9
+
10
+ import iso4217
11
+ from pydantic import BaseModel, Field, constr, validator, ValidationError
12
+
13
+
14
class TaxItem(BaseModel):
    """GST-style tax total (Indian invoices)."""

    gst: float = Field(
        ...,
        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
    )


class TaxItem1(BaseModel):
    """VAT-style tax total (older invoices)."""

    vat: float = Field(..., title="The total VAT present in the invoice")


class TaxNumberItem(BaseModel):
    """GSTIN form of the seller's tax number (15-char alphanumeric)."""

    gst_number: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )


class TaxNumberItem1(BaseModel):
    """VAT/TIN form of the seller's tax number."""

    vat_number: str = Field(..., title="The VAT/TIN number present in older invoices")


class TaxNumberItem2(BaseModel):
    """UIN form of the seller's tax number (foreign entities)."""

    ui_number: str = Field(..., title="The tax UIN issued to foreign entities")


class SellerDetails(BaseModel):
    """Seller block of the invoice; tax_number accepts GST, VAT or UIN forms."""

    name: Optional[str] = None
    address: Optional[str] = None
    contact: Optional[str] = None
    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
        ..., title="Tax information"
    )
    pan_number: constr(min_length=10, max_length=10) = Field(
        ..., title="The 10-character alphanumeric PAN code"
    )


class UIDDict(BaseModel):
    """Invoice number plus any other unique identifiers found on the bill."""

    invoice_number: str = Field(..., title="The invoice number")
    other_uids: Dict[str, str] = Field(
        ...,
        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
    )
57
+
58
+
59
class InformationExtractedFromABillReceipt(BaseModel):
    """Top-level schema for details extracted from a generic bill/receipt."""

    uids: UIDDict = Field(..., title="Invoice number and other UIDs")
    total: float = Field(..., title="Total amount or price")
    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
    name: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    currency: str = Field(
        default="INR",
        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
    )
    issue_date: date = Field(
        ..., title="The date the invoice was issued"
    )
    seller_details: SellerDetails = Field(..., title="Information about the seller")
    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")

    @validator("currency")
    @classmethod
    def check_currency(cls, v: str) -> str:
        """Validate the currency code against ISO 4217 and normalize to upper case.

        BUG FIX: pydantic validators must raise ValueError/TypeError/
        AssertionError; raising ValidationError directly fails at runtime
        because ValidationError requires (errors, model) constructor args.
        """
        # iso4217.Currency members are lowercase codes (e.g. "inr").
        if not iso4217.Currency.__members__.get(v.lower()):
            raise ValueError(f"{v} is not a valid ISO 4217 currency code")
        return v.upper()
categories/travel_cab/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Deterministic (temperature=0) chat model for cab-journey extraction.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs= {
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build categorizing chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents such as date/time/place of departure and arrival.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Pydantic parser wrapped in a fixing parser to recover from malformed output.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=fixing_parser
)
categories/travel_cab/model.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import date, time

from pydantic import BaseModel, Field


class InformationExtractedFromABillReceipt(BaseModel):
    """Details of a cab/taxi journey parsed from a bill or receipt:
    place/date/time of departure and arrival, and the ticket cost.
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    place_from: str = Field(..., title="place where journey starts")
    date_from: date = Field(
        ..., title="date on which journey starts (DD/MM/YYYY)"
    )
    time_from: time = Field(..., title="time at which journey starts")
    place_to: str = Field(..., title="place where journey end")
    date_to: date = Field(..., title="date on which journey end (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which journey end")
    amount: float = Field(..., title="cost of journey ticket")
categories/travel_flight/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Chain setup for extracting flight travel details from OCR text."""
from .model import InformationExtractedFromABillReceipt as PydanticModel

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

# Deterministic output (greedy decoding).
model = ChatOpenAI(temperature=0)

# Build extraction chain: a single human message carries the task
# description, the parser's {format_instructions} and the OCR text.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "Parse through and find the following details from the text extracted from a travel "
    "bill\n"
    "{format_instructions}\n"
    "{text}"
)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
# Parse LLM output into PydanticModel; on a parse failure, OutputFixingParser
# re-prompts the same model to repair the malformed output.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/travel_flight/model.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import date, time

from pydantic import BaseModel, Field


class InformationExtractedFromABillReceipt(BaseModel):
    """Details of a flight parsed from a travel bill: place/date/time of
    departure and arrival, PNR number, and the ticket cost.

    (The previous docstring was a commented-out ResponseSchema list left over
    from an earlier implementation; since the class docstring becomes the
    JSON-schema description fed to the LLM, it has been replaced by a
    human/LLM-readable summary.)
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    place_from: str = Field(..., title="place where flight starts/takes-off")
    date_from: date = Field(
        ..., title="date on which flight starts/takes-off (DD/MM/YYYY)"
    )
    time_from: time = Field(..., title="time at which flight starts/takes-off")
    place_to: str = Field(..., title="place where flight end/lands")
    date_to: date = Field(..., title="date on which flight end/lands (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which flight end/lands")
    pnr_number: str = Field(..., title="PNR Number of flight")
    amount: float = Field(..., title="cost of flight ticket")
categories/vendor/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Chain setup for extracting vendor invoice/registration details from OCR text."""
from .model import InformationExtractedFromABillReceipt as PydanticModel

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# Deterministic single completion; the sampling knobs are pinned to the
# OpenAI API defaults so results are reproducible.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build extraction chain: system prompt describes the task and carries the
# parser's {format_instructions}; the OCR text arrives as the human turn.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, addresses, bank details, invoice details, "
    "participant registration details."
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
# (Removed a leftover debug `print(output_parser.get_format_instructions())`
# and a commented-out `exit()` that ran on every import of this package.)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/vendor/model.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# generated by datamodel-codegen:
#   filename:  schema.json
#   timestamp: 2023-07-28T11:36:16+00:00

from __future__ import annotations

from datetime import datetime

# NOTE(review): `validator` and `ValidationError` are imported but unused in
# this module -- possibly left over from a removed validator; confirm before
# pruning (other category models do define validators).
from pydantic import BaseModel, Field, constr, validator, ValidationError


class BankDetails(BaseModel):
    """account holder name, bank name, account number, branch, ifs code, swift code"""

    account_holder_name: str = Field(..., title="The name of the account holder")
    bank_name: str = Field(..., title="The name of the bank")
    account_number: str = Field(..., title="The account number")
    branch: str = Field(..., title="The branch of the bank")
    ifs_code: str = Field(..., title="The IFS code of the bank")
    swift_code: str = Field(..., title="The SWIFT code of the bank")


class InformationExtractedFromABillReceipt(BaseModel):
    """
    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
    bank details: (account holder name, bank name, account number, branch, ifs code, swift
    code), recipient, registration id, registration fee, registration date/time
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    # A GSTIN is a 15-character code; only a minimum length is enforced here.
    gstin: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )
    billing_address: str = Field(..., title="The billing address")
    invoice_number: str = Field(..., title="The invoice number")
    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
    due_date: datetime = Field(..., title="The date-time the invoice is due")
    total: float = Field(..., title="Total amount or price")
    balance_due: float = Field(..., title="The amount due")
    bank_details: BankDetails = Field(..., title="Bank details")
    recipient: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    registration_id: str = Field(..., title="The registration ID")
    registration_fee: float = Field(..., title="The registration fee")
    registration_date_time: datetime = Field(..., title="The registration date-time")
environment.yml ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: env
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - asttokens=2.2.1=pyhd8ed1ab_0
7
+ - backcall=0.2.0=pyh9f0ad1d_0
8
+ - backports=1.0=pyhd8ed1ab_3
9
+ - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
10
+ - boost-cpp=1.78.0=h9f4b32c_3
11
+ - bzip2=1.0.8=h8ffe710_4
12
+ - ca-certificates=2023.7.22=h56e8100_0
13
+ - cairo=1.16.0=hdecc03f_1016
14
+ - colorama=0.4.6=pyhd8ed1ab_0
15
+ - comm=0.1.3=pyhd8ed1ab_0
16
+ - debugpy=1.6.7=py39h99910a6_0
17
+ - decorator=5.1.1=pyhd8ed1ab_0
18
+ - executing=1.2.0=pyhd8ed1ab_0
19
+ - expat=2.5.0=h63175ca_1
20
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
21
+ - font-ttf-inconsolata=3.000=h77eed37_0
22
+ - font-ttf-source-code-pro=2.038=h77eed37_0
23
+ - font-ttf-ubuntu=0.83=hab24e00_0
24
+ - fontconfig=2.14.2=hbde0cde_0
25
+ - fonts-conda-ecosystem=1=0
26
+ - fonts-conda-forge=1=0
27
+ - freetype=2.12.1=h546665d_1
28
+ - gettext=0.21.1=h5728263_0
29
+ - icu=72.1=h63175ca_0
30
+ - importlib-metadata=6.8.0=pyha770c72_0
31
+ - importlib_metadata=6.8.0=hd8ed1ab_0
32
+ - ipykernel=6.25.0=pyh6817e22_0
33
+ - ipython=8.14.0=pyh08f2357_0
34
+ - jedi=0.18.2=pyhd8ed1ab_0
35
+ - jupyter_client=8.3.0=pyhd8ed1ab_0
36
+ - jupyter_core=5.3.1=py39hcbf5309_0
37
+ - krb5=1.21.1=heb0366b_0
38
+ - lcms2=2.15=h3e3b177_1
39
+ - lerc=4.0.0=h63175ca_0
40
+ - libcurl=8.2.1=hd5e4a3a_0
41
+ - libdeflate=1.18=hcfcfb64_0
42
+ - libexpat=2.5.0=h63175ca_1
43
+ - libffi=3.4.2=h8ffe710_5
44
+ - libglib=2.76.4=he8f3873_0
45
+ - libiconv=1.17=h8ffe710_0
46
+ - libjpeg-turbo=2.1.5.1=hcfcfb64_0
47
+ - libpng=1.6.39=h19919ed_0
48
+ - libsodium=1.0.18=h8d14728_1
49
+ - libssh2=1.11.0=h7dfc565_0
50
+ - libtiff=4.5.1=h6c8260b_0
51
+ - libzlib=1.2.13=hcfcfb64_5
52
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
53
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
54
+ - openjpeg=2.5.0=ha2aaf27_2
55
+ - openssl=3.1.1=hcfcfb64_1
56
+ - packaging=23.1=pyhd8ed1ab_0
57
+ - parso=0.8.3=pyhd8ed1ab_0
58
+ - pcre2=10.40=h17e33f8_0
59
+ - pickleshare=0.7.5=py_1003
60
+ - pip=23.2.1=py39haa95532_0
61
+ - pixman=0.40.0=h8ffe710_0
62
+ - platformdirs=3.9.1=pyhd8ed1ab_0
63
+ - poppler=23.07.0=h45d20d0_0
64
+ - poppler-data=0.4.12=hd8ed1ab_0
65
+ - prompt-toolkit=3.0.39=pyha770c72_0
66
+ - prompt_toolkit=3.0.39=hd8ed1ab_0
67
+ - psutil=5.9.5=py39ha55989b_0
68
+ - pure_eval=0.2.2=pyhd8ed1ab_0
69
+ - pygments=2.15.1=pyhd8ed1ab_0
70
+ - python=3.9.17=h1aa4202_0
71
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
72
+ - python_abi=3.9=2_cp39
73
+ - pywin32=304=py39h99910a6_2
74
+ - pyzmq=25.1.0=py39hea35a22_0
75
+ - setuptools=68.0.0=py39haa95532_0
76
+ - six=1.16.0=pyh6c4a22f_0
77
+ - sqlite=3.41.2=h2bbff1b_0
78
+ - stack_data=0.6.2=pyhd8ed1ab_0
79
+ - tornado=6.3.2=py39ha55989b_0
80
+ - traitlets=5.9.0=pyhd8ed1ab_0
81
+ - typing-extensions=4.7.1=hd8ed1ab_0
82
+ - typing_extensions=4.7.1=pyha770c72_0
83
+ - ucrt=10.0.22621.0=h57928b3_0
84
+ - vc=14.3=h64f974e_17
85
+ - vc14_runtime=14.36.32532=hfdfe4a8_17
86
+ - vs2015_runtime=14.36.32532=h05e6639_17
87
+ - wcwidth=0.2.6=pyhd8ed1ab_0
88
+ - wheel=0.38.4=py39haa95532_0
89
+ - xz=5.2.6=h8d14728_0
90
+ - zeromq=4.3.4=h0e60522_1
91
+ - zipp=3.16.2=pyhd8ed1ab_0
92
+ - zlib=1.2.13=hcfcfb64_5
93
+ - zstd=1.5.2=h12be248_7
94
+ - pip:
95
+ - aiofiles==23.1.0
96
+ - aiohttp==3.8.5
97
+ - aiosignal==1.3.1
98
+ - altair==5.0.1
99
+ - annotated-types==0.5.0
100
+ - anyio==3.7.1
101
+ - async-timeout==4.0.2
102
+ - attrs==23.1.0
103
+ - certifi==2023.7.22
104
+ - charset-normalizer==3.2.0
105
+ - click==8.1.6
106
+ - contourpy==1.1.0
107
+ - cycler==0.11.0
108
+ - dataclasses-json==0.5.13
109
+ - datasets==2.14.1
110
+ - dill==0.3.7
111
+ - exceptiongroup==1.1.2
112
+ - fastapi==0.100.1
113
+ - ffmpy==0.3.1
114
+ - filelock==3.12.2
115
+ - fonttools==4.41.1
116
+ - frozenlist==1.4.0
117
+ - fsspec==2023.6.0
118
+ - gradio==3.39.0
119
+ - gradio-client==0.3.0
120
+ - greenlet==2.0.2
121
+ - h11==0.14.0
122
+ - httpcore==0.17.3
123
+ - httpx==0.24.1
124
+ - huggingface-hub==0.16.4
125
+ - idna==3.4
126
+ - importlib-resources==6.0.0
127
+ - iso4217==1.11.20220401
128
+ - jinja2==3.1.2
129
+ - jsonschema==4.18.4
130
+ - jsonschema-specifications==2023.7.1
131
+ - kiwisolver==1.4.4
132
+ - langchain==0.0.247
133
+ - langsmith==0.0.15
134
+ - linkify-it-py==2.0.2
135
+ - markdown-it-py==2.2.0
136
+ - markupsafe==2.1.3
137
+ - marshmallow==3.20.1
138
+ - matplotlib==3.7.2
139
+ - mdit-py-plugins==0.3.3
140
+ - mdurl==0.1.2
141
+ - multidict==6.0.4
142
+ - multiprocess==0.70.15
143
+ - mypy-extensions==1.0.0
144
+ - numexpr==2.8.4
145
+ - numpy==1.25.1
146
+ - openai==0.27.8
147
+ - openapi-schema-pydantic==1.2.4
148
+ - opencv-python-headless==4.8.0.74
149
+ - orjson==3.9.2
150
+ - pandas==2.0.3
151
+ - pdf2image==1.16.3
152
+ - pillow==10.0.0
153
+ - pyarrow==12.0.1
154
+ - pydantic==1.10.12
155
+ - pydantic-core==2.4.0
156
+ - pydub==0.25.1
157
+ - pyocr==0.8.3
158
+ - pyparsing==3.0.9
159
+ - pypdf==3.13.0
160
+ - pypiwin32==223
161
+ - python-multipart==0.0.6
162
+ - pytz==2023.3
163
+ - pyyaml==6.0.1
164
+ - referencing==0.30.0
165
+ - requests==2.31.0
166
+ - rpds-py==0.9.2
167
+ - semantic-version==2.10.0
168
+ - sniffio==1.3.0
169
+ - sqlalchemy==2.0.19
170
+ - starlette==0.27.0
171
+ - tenacity==8.2.2
172
+ - toolz==0.12.0
173
+ - tqdm==4.65.0
174
+ - typing-inspect==0.9.0
175
+ - tzdata==2023.3
176
+ - uc-micro-py==1.0.2
177
+ - urllib3==2.0.4
178
+ - uvicorn==0.23.1
179
+ - websockets==11.0.3
180
+ - xxhash==3.3.0
181
+ - yarl==1.9.2
examples/example1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
3
+ size 45782
examples/rotated.jpeg ADDED

Git LFS Details

  • SHA256: e98aa24e25b2c3f277c237664cba4616fbe5d80fe3099459fb81e2ef3720d23c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
examples/rotated.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
3
+ size 333463
examples/upright.jpeg ADDED

Git LFS Details

  • SHA256: 728be2c94b4af573145e5e89ffe5c3dfddb12a3055b85e60a23bd7697cff83f7
  • Pointer size: 132 Bytes
  • Size of remote file: 2.93 MB
examples/upright.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
3
+ size 325064
extract.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for extracting text from images and PDFs using OCR engines or other modules.
2
+ """
3
+ from io import BytesIO
4
+ from typing import List
5
+
6
+ import pyocr.tesseract
7
+ import pypdf
8
+ from PIL import Image
9
+
10
+
11
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Extracts text from the given PDF file using pypdf.

    Args:
        bytes_stream (BytesIO): The PDF file to extract text from.

    Returns: The extracted text, with each page followed by a blank-line
             separator.
    """
    reader = pypdf.PdfReader(bytes_stream)
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
25
+
26
+
27
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Extracts text from the given image using tesseract via pyocr.

    Args:
        image(PIL.Image.Image): The image to extract text from.

    Returns: The extracted text.

    Raises:
        RuntimeError: If the tesseract backend is not available. (Previously a
            bare ``Exception``; ``RuntimeError`` is a subclass of ``Exception``
            so existing handlers still catch it.)
    """
    if not pyocr.tesseract.is_available():
        raise RuntimeError("Tesseract is not available.")
    # English is hard-coded; parameterize `lang` if other locales are needed.
    text = pyocr.tesseract.image_to_string(image, lang="eng")
    return text
39
+
40
+
41
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    Note: every image in *images* is closed after its text has been read.

    Args:
        images(List[PIL.Image.Image]): The images to extract text from.

    Returns: The extracted text, one blank-line separator after each image.
    """
    chunks = []
    for img in images:
        chunks.append(extract_text_from_image_pyocr_tesseract(img))
        chunks.append("\n\n")
        img.close()
    return "".join(chunks)
55
+
56
if __name__ == '__main__':
    # Manual smoke test: run both extractors against the bundled examples.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)
    print("-"*25)
    filename = 'examples/upright.jpeg'
    image = Image.open(filename)
    text = extract_text_from_image_pyocr_tesseract(image)
    print(text)
    image.close()
main.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import categories
4
+ import processing
5
+ import extract
6
+ from PIL import Image
7
+ from pydantic import BaseModel
8
+ from io import BytesIO
9
+
10
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorizes the text and parses the information from it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The information parsed by the chain of the detected category.
    """
    detected_category = categories.categorize_text(text)
    return categories.run_category_chain(detected_category, text)
23
+
24
def process_pdf(filename: Path, extract_only=False) -> BaseModel:
    """Processes the given PDF file and extracts information from it.

    Args:
        filename(Path): The PDF file to process.
        extract_only: When True, return the raw extracted text instead of
            the parsed information.

    Returns: The extracted information (or the raw text if extract_only).
    """
    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    if len(text) < 20:
        # The embedded text layer is suspiciously short -- the PDF is likely
        # a scan (a scanner may only have added a watermark as text), so
        # rasterize the pages and OCR them instead.
        pages = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(pages)

    if extract_only:
        return text
    return categorize_and_parse_text(text)
45
+
46
def process_image(filename: Path, extract_only=False) -> BaseModel:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only: When True, return the raw extracted text instead of
            the parsed information.

    Returns: The extracted information (or the raw text if extract_only).
    """
    # preprocess_image closes the freshly opened image and returns an
    # orientation-corrected copy.
    preprocessed = processing.preprocess_image(Image.open(filename))
    text = extract.extract_text_from_image_pyocr_tesseract(preprocessed)
    preprocessed.close()

    if extract_only:
        return text
    return categorize_and_parse_text(text)
62
+
63
if __name__ == "__main__":
    # Manual smoke test: parse the bundled example invoice end to end and
    # pretty-print the resulting pydantic model as JSON.
    filename = Path("examples/example1.pdf")
    result = process_pdf(filename)
    print(result.json(indent=4))
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
processing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for (pre)processing images and PDFs before they are passed to the OCR
2
+ engine and other miscellaneous actions concerning processing.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ # import cv2
9
+ # import numpy as np
10
+ import pyocr
11
+ from pdf2image import pdf2image
12
+ from PIL import Image #, ImageOps
13
+
14
+ PDF_CONVERSION_DPI = 300
15
+ ROTATION_CONFIDENCE_THRESHOLD = 2.0
16
+
17
+ # def rotate_image(image: Image, angle: float):
18
+ # """Rotates the given image by the given angle.
19
+
20
+ # Args:
21
+ # image(PIL.Image.Image): The image to be rotated.
22
+ # angle(float): The angle to rotate the image by.
23
+
24
+ # Returns: The rotated image.
25
+ # """
26
+ # image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
27
+ # height, width, _ = image.shape # Get the image height, width, and channels
28
+ # # Compute the rotation matrix
29
+ # rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
30
+ # # Apply the rotation to the image
31
+ # rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
32
+ # rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
33
+ # return rotated_image
34
+
35
+
36
+ # class PDF_CONVERTER(enum.Enum):
37
+ # PDF2IMAGE = 1
38
+ # IMAGEMAGICK = 2
39
+
40
+
41
def correct_orientation(image: Image.Image) -> Image.Image:
    """Corrects the orientation of an image if it is not upright.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: The corrected pillow image as a copy. The original image is not closed.

    Raises:
        Exception: if the tesseract backend is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    # image = ImageOps.exif_transpose(image) # EXIF rotation is apparent, not actual
    orientation_info = {}
    try:
        # Tesseract orientation-and-script detection (OSD).
        orientation_info = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as e:
        # Best-effort: on OSD failure, fall through to the defaults below.
        print("Orientation detection failed: {}".format(e))
    # output = pytesseract.image_to_osd(
    #     image, config=" --psm 0", output_type=pytesseract.Output.DICT
    # )
    # Defaults when detection failed: angle 0 at confidence 100, i.e. the
    # branch below rotates by 0 degrees -- effectively a plain copy.
    angle = orientation_info.get("angle", 0)
    confidence = orientation_info.get("confidence", 100)
    # rotate = output["rotate"]
    # confidence = output["orientation_conf"]

    # NOTE(review): assumes pyocr's "angle" is the amount to rotate *by* to
    # make the page upright (not the skew to undo) -- confirm with pyocr docs.
    if confidence > ROTATION_CONFIDENCE_THRESHOLD:
        new_image = image.rotate(angle, expand=True)
    else:
        new_image = image.copy()
    return new_image
71
+
72
+
73
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Converts a PDF to an image using pdf2image.

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be converted.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
83
+
84
+
85
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to an image using ImageMagick.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages. Pages
                                   are saved in the folder as page.jpg or as page-01.jpg,
                                   page-02.jpg, etc.

    Returns: dest_folder

    Raises:
        subprocess.CalledProcessError: if the ImageMagick invocation fails.
    """
    import subprocess  # local import; keeps the module's top-level imports untouched

    # Bug fix: the original os.system() call joined adjacent f-string pieces
    # with no separating spaces, producing a single mangled word such as
    # "magick convert-density 300...-quality 100...", so the conversion could
    # never run. Using an argument list (shell=False) also avoids shell
    # quoting problems with paths containing spaces.
    subprocess.run(
        [
            "magick", "convert",
            "-density", str(PDF_CONVERSION_DPI),
            str(filename),
            "-quality", "100",
            str(dest_folder / "page.jpg"),
        ],
        check=True,
    )
    return dest_folder
102
+
103
+
104
def preprocess_image(image: Image.Image) -> Image.Image:
    """Preprocesses an image for future use with OCR.
    The following operations are performed:
      1. Orientation correction

    Note: the input image is closed; work continues on the returned copy.

    Args:
        image(PIL.Image.Image): The image to be preprocessed.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    image.close()
    return upright
118
+
119
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
      1. PDF to image conversion
      2. Orientation correction

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    processed = []
    for page in convert_pdf_to_image_pdf2image(pdf_bytes):
        corrected = preprocess_image(page)
        # preprocess_image already closes its input; this second close is a
        # harmless no-op kept for symmetry with the rest of the module.
        page.close()
        processed.append(corrected)
    return processed
137
+
138
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
      1. PDF to image conversion (via ImageMagick, into a temp directory)
      2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    import tempfile  # local import; keeps the module's top-level imports untouched

    # Bug fixes:
    #   * ``dest_folder`` was read before ever being assigned (NameError);
    #     a temporary directory is now created for the converted pages.
    #   * the glob yields Path objects, which were passed straight to
    #     preprocess_image as if they were PIL images; they are now opened
    #     first. preprocess_image closes each opened image itself.
    dest_folder = convert_pdf_to_image_ImageMagick(
        filename, Path(tempfile.mkdtemp())
    )
    result = []
    for page_path in sorted(dest_folder.glob("*.jpg")):
        result.append(preprocess_image(Image.open(page_path)))
    return result
156
+
157
if __name__ == '__main__':
    # Manual smoke test: preprocess a single image, then every page of a
    # rotated PDF, displaying each result in the system image viewer.
    filename = 'examples/upright.jpeg'
    image = Image.open(filename)
    new_image = preprocess_image(image)
    # preprocess_image already closes its input; this close is a no-op.
    image.close()
    new_image.show()
    new_image.close()

    filename = 'examples/rotated.pdf'
    with open(filename, 'rb') as file:
        bytes_ = bytes(file.read())
    images = preprocess_pdf_pdf2image(bytes_)
    for image in images:
        image.show()
        image.close()
requirements.txt ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ annotated-types==0.5.0
6
+ anyio==3.7.1
7
+ asttokens==2.2.1
8
+ async-timeout==4.0.2
9
+ attrs==23.1.0
10
+ backcall==0.2.0
11
+ backports.functools-lru-cache==1.6.5
12
+ certifi==2023.7.22
13
+ charset-normalizer==3.2.0
14
+ click==8.1.6
15
+ colorama==0.4.6
16
+ comm==0.1.3
17
+ contourpy==1.1.0
18
+ cycler==0.11.0
19
+ dataclasses-json==0.5.13
20
+ datasets==2.14.1
21
+ debugpy==1.6.7
22
+ decorator==5.1.1
23
+ dill==0.3.7
24
+ exceptiongroup==1.1.2
25
+ executing==1.2.0
26
+ fastapi==0.100.1
27
+ ffmpy==0.3.1
28
+ filelock==3.12.2
29
+ fonttools==4.41.1
30
+ frozenlist==1.4.0
31
+ fsspec==2023.6.0
32
+ gradio==3.39.0
33
+ gradio_client==0.3.0
34
+ greenlet==2.0.2
35
+ h11==0.14.0
36
+ httpcore==0.17.3
37
+ httpx==0.24.1
38
+ huggingface-hub==0.16.4
39
+ idna==3.4
40
+ importlib-metadata==6.8.0
41
+ importlib-resources==6.0.0
42
+ ipykernel==6.25.0
43
+ ipython==8.14.0
44
+ iso4217==1.11.20220401
45
+ jedi==0.18.2
46
+ Jinja2==3.1.2
47
+ jsonschema==4.18.4
48
+ jsonschema-specifications==2023.7.1
49
+ jupyter_client==8.3.0
50
+ jupyter_core==5.3.1
51
+ kiwisolver==1.4.4
52
+ langchain==0.0.247
53
+ langsmith==0.0.15
54
+ linkify-it-py==2.0.2
55
+ markdown-it-py==2.2.0
56
+ MarkupSafe==2.1.3
57
+ marshmallow==3.20.1
58
+ matplotlib==3.7.2
59
+ matplotlib-inline==0.1.6
60
+ mdit-py-plugins==0.3.3
61
+ mdurl==0.1.2
62
+ multidict==6.0.4
63
+ multiprocess==0.70.15
64
+ mypy-extensions==1.0.0
65
+ nest-asyncio==1.5.6
66
+ numexpr==2.8.4
67
+ numpy==1.25.1
68
+ openai==0.27.8
69
+ openapi-schema-pydantic==1.2.4
70
+ opencv-python-headless==4.8.0.74
71
+ orjson==3.9.2
72
+ packaging==23.1
73
+ pandas==2.0.3
74
+ parso==0.8.3
75
+ pdf2image==1.16.3
76
+ pickleshare==0.7.5
77
+ Pillow==10.0.0
78
+ pip==23.2.1
79
+ platformdirs==3.9.1
80
+ prompt-toolkit==3.0.39
81
+ psutil==5.9.5
82
+ pure-eval==0.2.2
83
+ pyarrow==12.0.1
84
+ pydantic==1.10.12
85
+ pydantic_core==2.4.0
86
+ pydub==0.25.1
87
+ Pygments==2.15.1
88
+ pyocr==0.8.3
89
+ pyparsing==3.0.9
90
+ pypdf==3.13.0
91
+ pypiwin32==223
92
+ python-dateutil==2.8.2
93
+ python-multipart==0.0.6
94
+ pytz==2023.3
95
+ pywin32==304
96
+ PyYAML==6.0.1
97
+ pyzmq==25.1.0
98
+ referencing==0.30.0
99
+ requests==2.31.0
100
+ rpds-py==0.9.2
101
+ semantic-version==2.10.0
102
+ setuptools==68.0.0
103
+ six==1.16.0
104
+ sniffio==1.3.0
105
+ SQLAlchemy==2.0.19
106
+ stack-data==0.6.2
107
+ starlette==0.27.0
108
+ tenacity==8.2.2
109
+ toolz==0.12.0
110
+ tornado==6.3.2
111
+ tqdm==4.65.0
112
+ traitlets==5.9.0
113
+ typing_extensions==4.7.1
114
+ typing-inspect==0.9.0
115
+ tzdata==2023.3
116
+ uc-micro-py==1.0.2
117
+ urllib3==2.0.4
118
+ uvicorn==0.23.1
119
+ wcwidth==0.2.6
120
+ websockets==11.0.3
121
+ wheel==0.38.4
122
+ xxhash==3.3.0
123
+ yarl==1.9.2
124
+ zipp==3.16.2