"""Gradio app that extracts utility-bill fields from a PDF with Gemini.

The first page of the uploaded PDF is rendered to an image and its text layer
is extracted; both are sent to a vision model together with a JSON schema
built from ``headers``. The model's JSON answer is shown as a one-row table.
"""

import json
import os
from pathlib import Path
from typing import Optional

import google.generativeai as genai
import gradio as gr
import pandas as pd
from gradio_pdf import PDF
from pdf2image import convert_from_path
from pypdf import PdfReader

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Model id passed to genai.GenerativeModel.
# NOTE(review): confirm this model id is still served by the API in use.
MODEL_NAME = "gemini-pro-vision"

# Single source of truth for the field names: these are the table columns,
# the keys requested from the model, and the keys looked up in its answer.
headers = [
    "DUE DATE",
    "SERVICE ADDRESS",
    "SERVICE PERIOD",
    "ELECTRICITY USAGE (KWH)",
    "ELECTRICITY SPEND ($)",
    "GAS USAGE (THERMS)",
    "GAS SPEND ($)",
    "WATER USAGE (CCF)",
    "WATER SPEND ($)",
    "SEWER ($)",
    "REFUSE ($)",
    "STORM DRAIN ($)",
    "UTILITY USERS TAX ($)",
    "TOTAL CURRENT CHARGES ($)",
    "TOTAL AMOUNT DUE",
]

inputs = [PDF(label="Document")]
outputs = [
    gr.Dataframe(
        row_count=(1, "dynamic"),
        # Derived from ``headers`` so the table cannot drift out of sync.
        col_count=(len(headers), "fixed"),
        label="Utility",
        headers=headers,
        datatype=["str"] * len(headers),
    )
]


def get_content_between_curly_braces(text: str) -> Optional[str]:
    """Return the substring from the first ``{`` to the last ``}``, inclusive.

    Args:
        text: The string to extract content from.

    Returns:
        The extracted substring (braces included), or ``None`` when there is
        no ``{`` or no ``}`` after the first ``{``.
    """
    start_index = text.find("{")
    end_index = text.rfind("}")
    if start_index == -1 or end_index <= start_index:
        return None
    return text[start_index : end_index + 1]


def _build_prompt(ocr_text: str) -> str:
    """Build the extraction prompt, with a JSON schema derived from ``headers``."""
    # Generating the schema (instead of hand-writing it) guarantees that the
    # keys requested from the model are exactly the keys looked up afterwards
    # and that the schema itself is well-formed JSON.
    schema = {
        "type": "object",
        "properties": {header: {"type": "string"} for header in headers},
    }
    # The instruction wording is kept as it was; only the schema is generated.
    instructions = (
        "Please extract the following JSON object from the utility bill I give. "
        f"Here is the noisy OCR extractio of the page {ocr_text}. \n"
        "Depending on the document, it may contain values for only a few keys "
        "such as SEWER. So, you have to be extra carefull."
    )
    return instructions + "This JSON schema: " + json.dumps(schema) + "."


def parse_utility_bill(filepath):
    """Extract the ``headers`` fields from the first page of a utility bill.

    Args:
        filepath: Path of the uploaded PDF as handed over by the PDF input.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``headers`` (in that
        order); fields missing from the model's answer are ``None``.

    Raises:
        gr.Error: If no file was given, the PDF has no pages, the first page
            cannot be rendered, or the model's answer contains no parseable
            JSON object.
    """
    if not filepath:
        raise gr.Error("Please upload a PDF document first.")

    print("FOUND PDF!")
    reader = PdfReader(filepath)
    if len(reader.pages) == 0:
        raise gr.Error("The uploaded PDF has no pages.")
    text = reader.pages[0].extract_text()

    # Only the first page is sent to the model, so only that page is rendered.
    images = convert_from_path(filepath, first_page=1, last_page=1)
    if not images:
        raise gr.Error("Could not render the first page of the PDF.")
    image = images[0]

    print("---------------------------------------------------------------")
    print("We have the image at: ")
    print(image)
    print("Here is the text:")
    print(text)
    print("---------------------------------------------------------------")

    model = genai.GenerativeModel(MODEL_NAME)
    prompt_text = _build_prompt(text)
    print(f"PROMPT: {prompt_text}")

    response = model.generate_content(
        [prompt_text, image],
        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
    )

    # The answer may wrap the JSON in prose or code fences; keep only the
    # outermost {...} span before parsing.
    json_response = get_content_between_curly_braces(response.text)
    if json_response is None:
        raise gr.Error("The model response did not contain a JSON object.")
    try:
        response_dict = json.loads(json_response)
    except json.JSONDecodeError as err:
        raise gr.Error(f"The model returned malformed JSON: {err}") from err
    print(response_dict)

    # Keep only the expected keys, in column order; absent keys become None.
    rectified_dict = {key: response_dict.get(key) for key in headers}
    print(rectified_dict)

    return pd.DataFrame([rectified_dict], columns=headers)


# Launched at import time, as before, so running the script starts the app.
demo = gr.Interface(
    fn=parse_utility_bill,
    inputs=inputs,
    outputs=outputs,
    examples=[
        "utl-bill-sample.pdf",
        "nem-2-utility-bill-sample.pdf",
        "Sample_Utility_Bill.pdf",
        "Water Bill Sample.pdf",
        "canada.pdf",
        "water.pdf",
    ],
    title="🌏⚡💧🔥PDF Utitlity Bill Parser",
)
demo.launch()