# kkawamu1's picture
# Fix package issues
# 376bc5c
import json
import os
from pathlib import Path
import google.generativeai as genai
import gradio as gr
import pandas as pd
from gradio_pdf import PDF
from pdf2image import convert_from_path
from pypdf import PdfReader
# Configure the Gemini client once at import time. The key is read from the
# GOOGLE_API_KEY environment variable; os.environ.get yields None when unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
# Column names of the output table, in display order.
# parse_utility_bill looks each name up as an exact key in the JSON object
# returned by the model, so every entry must match the key the model is asked
# to produce (monetary fields consistently carry the "($)" suffix).
headers = [
    "DUE DATE",
    "SERVICE ADDRESS",
    "SERVICE PERIOD",
    "ELECTRICITY USAGE (KWH)",
    "ELECTRICITY SPEND ($)",
    "GAS USAGE (THERMS)",
    "GAS SPEND ($)",
    "WATER USAGE (CCF)",
    "WATER SPEND ($)",
    "SEWER ($)",
    "REFUSE ($)",
    "STORM DRAIN ($)",
    "UTILITY USERS TAX ($)",
    "TOTAL CURRENT CHARGES ($)",
    "TOTAL AMOUNT DUE ($)",
]
# Input side of the interface: a single PDF upload component.
inputs = [PDF(label="Document")]

# Output side: one table labelled "Utility" with a string column per header.
# It starts with one row (row count marked "dynamic") and has 15 columns
# (column count marked "fixed"), matching the 15 entries in `headers`.
outputs = [
    gr.Dataframe(
        label="Utility",
        headers=headers,
        datatype=["str"] * len(headers),
        row_count=(1, "dynamic"),
        col_count=(len(headers), "fixed"),
    )
]
def get_content_between_curly_braces(text):
    """
    Return the substring spanning the first "{" through the last "}" of a string.

    Args:
        text: The string to search.

    Returns:
        The substring including both braces, or None when there is no "{" or
        no "}" located after it.
    """
    opening = text.find("{")
    closing = text.rfind("}")

    # Guard clause: no opening brace, or no closing brace after it.
    if opening == -1 or closing <= opening:
        return None

    return text[opening : closing + 1]
def parse_utility_bill(filepath):
    """Extract utility-bill fields from the first page of a PDF with Gemini.

    The first page's extracted text (pypdf) and its rendered image (pdf2image)
    are sent to the "gemini-pro-vision" model together with a JSON schema
    built from the module-level ``headers`` list. The JSON object found in the
    reply is then mapped onto ``headers``; keys the model did not return are
    filled with ``None``.

    Args:
        filepath: Path of the PDF to parse (presumably the value supplied by
            the Gradio PDF input component).

    Returns:
        A single-row ``pandas.DataFrame`` with one column per ``headers`` entry.

    Raises:
        gr.Error: If no file was given, the PDF has no pages or cannot be
            rendered, or the model reply holds no parseable JSON object.
    """
    if not filepath:
        raise gr.Error("Please upload a PDF document first.")

    print("FOUND PDF!")
    reader = PdfReader(filepath)
    if len(reader.pages) == 0:
        raise gr.Error("The PDF does not contain any pages.")
    text = reader.pages[0].extract_text()

    # Only the first page is sent to the model, so render just that page
    # instead of rasterizing the whole document.
    images = convert_from_path(filepath, first_page=1, last_page=1)
    if not images:
        raise gr.Error("Could not render the first page of the PDF.")
    image = images[0]

    print("---------------------------------------------------------------")
    print("We have the image at: ")
    print(image)
    print("Here is the text:")
    print(text)
    print("---------------------------------------------------------------")

    model = genai.GenerativeModel(
        "gemini-pro-vision",
    )

    # Derive the schema from `headers` so the keys requested from the model
    # are exactly the keys looked up below, and the schema is valid JSON.
    schema = {
        "type": "object",
        "properties": {key: {"type": "string"} for key in headers},
    }
    prompt_text = (
        f""" Please extract the following JSON object from the utility bill I give. Here is the noisy OCR extractio of the page {text}. Depending on the document, it may contain values for only a few keys such as SEWER. So, you have to be extra carefull."""
        + "This JSON schema:\n"
        + json.dumps(schema)
    )
    print(f"PROMPT: {prompt_text}")

    response = model.generate_content(
        [
            prompt_text,
            image,
        ],
        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
    )

    # NOTE(review): the SDK's `.text` accessor is understood to raise
    # ValueError when the reply carries no text part (e.g. a blocked
    # response) -- confirm against the installed google-generativeai version.
    try:
        reply_text = response.text
    except ValueError as err:
        raise gr.Error("The model did not return any text for this document.") from err

    json_response = get_content_between_curly_braces(reply_text)
    if json_response is None:
        raise gr.Error("The model reply did not contain a JSON object.")

    try:
        response_dict = json.loads(json_response)
    except json.JSONDecodeError as err:
        raise gr.Error(f"The model reply was not valid JSON: {err}") from err
    if not isinstance(response_dict, dict):
        raise gr.Error("The model reply was not a JSON object.")
    print(response_dict)

    # Keep only the expected columns, in `headers` order; missing keys -> None.
    rectified_dict = {key: response_dict.get(key) for key in headers}
    print(rectified_dict)

    return pd.DataFrame([rectified_dict])
# Build the web UI around parse_utility_bill and start serving it.
# The example entries are PDF file names offered as sample inputs.
gr.Interface(
    fn=parse_utility_bill,
    inputs=inputs,
    outputs=outputs,
    examples=[
        "utl-bill-sample.pdf",
        "nem-2-utility-bill-sample.pdf",
        "Sample_Utility_Bill.pdf",
        "Water Bill Sample.pdf",
        "canada.pdf",
        "water.pdf",
    ],
    title="🌏⚡💧🔥PDF Utility Bill Parser",
).launch()