kkawamu1 committed on
Commit
b17d312
1 Parent(s): 093230d

First commit

Browse files
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10.0
Sample_Utility_Bill.pdf ADDED
Binary file (696 kB). View file
 
Water Bill Sample.pdf ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import google.generativeai as genai
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from gradio_pdf import PDF
8
+ from pdf2image import convert_from_path
9
+ from pypdf import PdfReader
10
+ from pathlib import Path
11
# Directory that contains this script.
dir_ = Path(__file__).parent

# Configure the Gemini client with the API key taken from the environment.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Column names of the result table. parse_utility_bill looks these exact
# strings up as keys in the model's JSON answer, so every entry has to match
# the key the extraction prompt requests. The final entry carries the "($)"
# suffix for that reason: without it the key never matched and the column
# always stayed empty.
headers = [
    "DUE DATE",
    "SERVICE ADDRESS",
    "SERVICE PERIOD",
    "ELECTRICITY USAGE (KWH)",
    "ELECTRICITY SPEND ($)",
    "GAS USAGE (THERMS)",
    "GAS SPEND ($)",
    "WATER USAGE (CCF)",
    "WATER SPEND ($)",
    "SEWER ($)",
    "REFUSE ($)",
    "STORM DRAIN ($)",
    "UTILITY USERS TAX ($)",
    "TOTAL CURRENT CHARGES ($)",
    "TOTAL AMOUNT DUE ($)",
]
31
+
32
+
33
# Input side of the interface: a single PDF upload widget.
inputs = [PDF(label="Document")]

# Output side of the interface: a table with one fixed, string-typed column
# per entry in `headers` and a dynamic number of rows (starting at one).
outputs = [
    gr.Dataframe(
        label="Utility",
        headers=headers,
        row_count=(1, "dynamic"),
        col_count=(len(headers), "fixed"),
        datatype=["str"] * len(headers),
    )
]
60
+
61
+
62
def get_content_between_curly_braces(text):
    """
    Return the span of ``text`` running from its first "{" to its last "}".

    Args:
        text: The string to search.

    Returns:
        The substring including both braces, or None when there is no
        opening brace or no closing brace located after it.
    """
    first_open = text.find("{")
    if first_open == -1:
        return None

    last_close = text.rfind("}")
    if last_close <= first_open:
        return None

    return text[first_open : last_close + 1]
79
+
80
+
81
+
82
def parse_utility_bill(filepath):
    """
    Extract the billing fields of a utility-bill PDF into a one-row table.

    Only the first page of the PDF is used: its extracted text and a rendered
    image of it are sent to the Gemini vision model together with a JSON
    schema listing every entry of ``headers``. The JSON object found in the
    model's answer is then reduced to exactly those keys.

    Args:
        filepath: Path of the PDF file to parse.

    Returns:
        A pandas DataFrame with a single row and one column per entry in
        ``headers``; fields missing from the model's answer are None.

    Raises:
        gr.Error: If the PDF has no page to read, or the model's answer does
            not contain a parsable JSON object.
    """
    print("FOUND PDF!")
    reader = PdfReader(filepath)
    if not reader.pages:
        raise gr.Error("The uploaded PDF does not contain any pages.")
    text = reader.pages[0].extract_text()

    # Only the first page is sent to the model, so render just that page
    # instead of rasterizing the whole document.
    images = convert_from_path(filepath, first_page=1, last_page=1)
    if not images:
        raise gr.Error("The first page of the PDF could not be rendered.")
    image = images[0]

    print("---------------------------------------------------------------")
    print("We have the image at: ")
    print(image)
    print("Here is the text:")
    print(text)
    print("---------------------------------------------------------------")
    model = genai.GenerativeModel(
        "gemini-pro-vision",
    )

    # Build the schema from `headers` so the requested keys are always the
    # same strings that are looked up below, and so the schema shown to the
    # model is well-formed, double-quoted JSON.
    schema = {
        "type": "object",
        "properties": {key: {"type": "string"} for key in headers},
    }
    prompt_text = (
        f""" Please extract the following JSON object from the utility bill I give. Here is the noisy OCR extractio of the page {text}. Depending on the document, it may contain values for only a few keys such as SEWER. So, you have to be extra carefull."""
        + "This JSON schema:\n"
        + json.dumps(schema)
        + "."
    )
    print(f"PROMPT: {prompt_text}")
    response = model.generate_content(
        [
            prompt_text,
            image,
        ],
        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
    )

    json_response = get_content_between_curly_braces(response.text)
    if json_response is None:
        raise gr.Error("The model response did not contain a JSON object.")
    try:
        response_dict = json.loads(json_response)
    except json.JSONDecodeError as err:
        raise gr.Error("The model response was not valid JSON.") from err
    print(response_dict)

    # Keep exactly the expected columns, in header order; anything the model
    # did not return becomes None.
    rectified_dict = {key: response_dict.get(key) for key in headers}
    print(rectified_dict)

    return pd.DataFrame([rectified_dict])
131
+
132
# Build the web UI around parse_utility_bill and start serving it.
gr.Interface(
    fn=parse_utility_bill,
    inputs=inputs,
    outputs=outputs,
    # Resolve the bundled sample PDFs against this script's directory so the
    # examples do not depend on the process's working directory.
    examples=[
        str(dir_ / example_name)
        for example_name in (
            "utl-bill-sample.pdf",
            "nem-2-utility-bill-sample.pdf",
            "Sample_Utility_Bill.pdf",
            "Water Bill Sample.pdf",
            "canada.pdf",
            "water.pdf",
        )
    ],
    title="🌏⚡💧🔥PDF Utitlity Bill Parser",
).launch()
139
+
canada.pdf ADDED
Binary file (112 kB). View file
 
nem-2-utility-bill-sample.pdf ADDED
Binary file (889 kB). View file
 
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr-all
2
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pypdf
2
+ pytesseract
3
+ gradio_pdf
utl-bill-sample.pdf ADDED
Binary file (76.4 kB). View file
 
water.pdf ADDED
Binary file (252 kB). View file