First commit · kkawamu1/Utility_Bill

	@@ -0,0 +1,139 @@

+import json
+import os
+import google.generativeai as genai
+import gradio as gr
+import pandas as pd
+from gradio_pdf import PDF
+from pdf2image import convert_from_path
+from pypdf import PdfReader
+from pathlib import Path
# Folder that holds this script.
dir_ = Path(__file__).parent

# Hand the Gemini client its API key, read from the environment.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Column names of the result table, in display order.
headers = [
    "DUE DATE",
    "SERVICE ADDRESS",
    "SERVICE PERIOD",
    "ELECTRICITY USAGE (KWH)",
    "ELECTRICITY SPEND ($)",
    "GAS USAGE (THERMS)",
    "GAS SPEND ($)",
    "WATER USAGE (CCF)",
    "WATER SPEND ($)",
    "SEWER ($)",
    "REFUSE ($)",
    "STORM DRAIN ($)",
    "UTILITY USERS TAX ($)",
    "TOTAL CURRENT CHARGES ($)",
    "TOTAL AMOUNT DUE",
]

# Gradio input component: a single PDF upload.
inputs = [PDF(label="Document")]

# Gradio output component: one string-typed column per entry in ``headers``.
outputs = [
    gr.Dataframe(
        row_count=(1, "dynamic"),
        col_count=(len(headers), "fixed"),
        label="Utility",
        headers=headers,
        datatype=["str"] * len(headers),
    )
]
def get_content_between_curly_braces(text):
    """Return the span of *text* from its first ``{`` to its last ``}``.

    Both braces are included in the result.

    Args:
        text: The string to search.

    Returns:
        The substring running from the first opening brace through the last
        closing brace, or None when there is no opening brace or no closing
        brace after it.
    """
    opening = text.find("{")
    if opening == -1:
        return None

    closing = text.rfind("}")
    if closing <= opening:
        # Either no "}" at all (-1) or the last one precedes the first "{".
        return None

    return text[opening : closing + 1]
def parse_utility_bill(filepath):
    """Extract the utility-bill fields from the first page of a PDF.

    The first page's extracted text and a rendered image of that page are
    sent to the Gemini vision model together with a JSON schema generated
    from ``headers``. The JSON object found in the reply is then mapped onto
    those columns.

    Args:
        filepath: Path of the PDF file to parse.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``headers``, in that
        order. Fields absent from the model's reply are ``None``.

    Raises:
        ValueError: If the PDF has no pages, the first page cannot be
            rendered, or the model's reply does not contain a JSON object.
    """
    print("FOUND PDF!")
    reader = PdfReader(filepath)
    if len(reader.pages) == 0:
        raise ValueError(f"PDF has no pages: {filepath}")
    text = reader.pages[0].extract_text()

    # Only the first page is used, so render just that page instead of
    # rasterising the whole document.
    images = convert_from_path(filepath, first_page=1, last_page=1)
    if not images:
        raise ValueError(f"Could not render the first page of: {filepath}")
    image = images[0]

    print("---------------------------------------------------------------")
    print("We have the image at: ")
    print(image)
    print("Here is the text:")
    print(text)
    print("---------------------------------------------------------------")

    model = genai.GenerativeModel(
        "gemini-pro-vision",
    )

    # Generate the schema from ``headers`` so the keys requested from the
    # model always match the keys looked up below, and the schema text is
    # well-formed JSON.
    schema = {
        "type": "object",
        "properties": {name: {"type": "string"} for name in headers},
    }
    prompt_text = (
        f""" Please extract the following JSON object from the utility bill I give. Here is the noisy OCR extractio of the page {text}. Depending on the document, it may contain values for only a few keys such as SEWER. So, you have to be extra carefull."""
        + "This JSON schema:\n"
        + json.dumps(schema)
        + "."
    )
    print(f"PROMPT: {prompt_text}")

    response = model.generate_content(
        [
            prompt_text,
            image,
        ],
        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
    )

    raw_json = get_content_between_curly_braces(response.text)
    if raw_json is None:
        raise ValueError("Model response did not contain a JSON object.")
    try:
        response_dict = json.loads(raw_json)
    except json.JSONDecodeError as err:
        raise ValueError("Model response contained malformed JSON.") from err
    if not isinstance(response_dict, dict):
        raise ValueError("Model response JSON is not an object.")
    print(response_dict)

    # Keep only the expected columns, in display order; missing ones -> None.
    rectified_dict = {column: response_dict.get(column) for column in headers}
    print(rectified_dict)

    return pd.DataFrame([rectified_dict], columns=headers)
# Sample bills offered as one-click examples in the UI.
example_files = [
    "utl-bill-sample.pdf",
    "nem-2-utility-bill-sample.pdf",
    "Sample_Utility_Bill.pdf",
    "Water Bill Sample.pdf",
    "canada.pdf",
    "water.pdf",
]

# Wire the parser to its input/output components and start the web app.
demo = gr.Interface(
    fn=parse_utility_bill,
    inputs=inputs,
    outputs=outputs,
    examples=example_files,
    title="🌏⚡💧🔥PDF Utitlity Bill Parser",
)
demo.launch()


1	+ 3.10.0

	@@ -0,0 +1,2 @@


1	+ tesseract-ocr-all
2	+ poppler-utils

	@@ -0,0 +1,3 @@