HarshilRamiAISV committed · Commit 79ac2cd · verified · 1 Parent(s): 615f50c

Upload 3 files

Files changed (3):
  1. app.py +529 -0
  2. process.py +376 -0
  3. requirements.txt +120 -0
app.py ADDED
@@ -0,0 +1,529 @@
+ import os
+ import shutil
+ 
+ import PyPDF2
+ import streamlit as st
+ 
+ from process import process, process_using_llm
+ 
+ TEMP_DIR = "temp_files"
+ 
+ st.set_page_config(layout="wide", page_title="KYC Doc AI")
+ 
+ 
+ def cleanup_temp_files():
+     try:
+         if os.path.exists(TEMP_DIR):
+             # Remove all files in TEMP_DIR
+             shutil.rmtree(TEMP_DIR)
+             print(f"Temporary files in {TEMP_DIR} have been deleted.")
+             # Re-create the temp directory after cleanup
+             os.makedirs(TEMP_DIR)
+     except Exception as e:
+         print(f"An error occurred during cleanup: {e}")
+ 
+ 
+ def extract_pages(input_pdf_path, output_pdf_path, start_page=None, end_page=None):
+     try:
+         # Open the PDF file
+         with open(input_pdf_path, 'rb') as input_pdf:
+             reader = PyPDF2.PdfReader(input_pdf)
+             total_pages = len(reader.pages)
+ 
+             # Create a PDF writer object for the new PDF
+             writer = PyPDF2.PdfWriter()
+ 
+             # Default: extract only the first page if no specific input is provided
+             if start_page is None and end_page is None:
+                 start_page, end_page = 0, 0
+ 
+             # Special case: start_page=0 and end_page=-1 extracts the first and last page only
+             if start_page == 0 and end_page == -1:
+                 writer.add_page(reader.pages[0])   # First page
+                 writer.add_page(reader.pages[-1])  # Last page
+             # If only the first page is required
+             elif start_page == 0 and end_page == 0:
+                 writer.add_page(reader.pages[0])   # First page
+             else:
+                 print("Invalid input. Only first page or (first and last page) extraction is allowed.")
+                 return
+ 
+             # Write the combined PDF to a new file
+             with open(output_pdf_path, 'wb') as output_pdf:
+                 writer.write(output_pdf)
+ 
+         print(f"PDF saved as {output_pdf_path}")
+ 
+     except Exception as e:
+         print(f"An error occurred: {e}")
+ 
+ 
+ def merge_dicts_by_aadhaar(data):
+     """Merge the front/back extractions of the same Aadhaar card into a single record."""
+     new_dic = {}
+     for dic in data:
+         aadhar = dic.get("Aadhaar Number")
+         if not aadhar:
+             continue
+         aadhar = aadhar.replace(" ", "")
+         if aadhar not in new_dic:
+             new_dic[aadhar] = {}
+         # The side that carries Gender and Date of birth also carries Name and number;
+         # the other side carries the Address.
+         if dic.get('Gender') and dic.get('Date of birth'):
+             new_dic[aadhar]['Gender'] = dic.get('Gender')
+             new_dic[aadhar]['Date of birth'] = dic.get('Date of birth')
+             new_dic[aadhar]['Name'] = dic.get('Name')
+             new_dic[aadhar]['Aadhaar Number'] = aadhar
+         else:
+             new_dic[aadhar]['Address'] = dic.get('Address')
+ 
+     return list(new_dic.values())
+ 
+ 
+ def process_uploads(uploaded_files):
+     try:
+         company_name = ""
+         company_name_legal = ""
+         company_trade_name = ""
+         gst_number = ""
+         pan_number_company = ""
+         coi_number = ""
+         director_names = []
+ 
+         extracted_results = {}
+         if not os.path.exists(TEMP_DIR):
+             os.makedirs(TEMP_DIR)
+ 
+         # Director PAN cards: can be individual files or a single file
+         director_pans = uploaded_files.get('director_pans', None)
+         if director_pans:
+             director_pan_data = []
+             for pan in director_pans:
+                 file_path = os.path.join(TEMP_DIR, pan.name)
+                 with open(file_path, "wb") as temp_file:
+                     temp_file.write(pan.getbuffer())
+                 ocr_data = process(file_path)
+                 content = ""
+                 for page_num, text in ocr_data.items():
+                     content += text + '\n'
+ 
+                 details = process_using_llm(content, "pan_user")
+                 if details:
+                     director_pan_data.append(details)
+ 
+             for pan_data in director_pan_data:
+                 if "Name" in pan_data:
+                     director_names.append(pan_data.get('Name').strip().lower())
+             extracted_results['Pan Cards of Directors'] = director_pan_data
+ 
+         director_aadhars = uploaded_files.get("director_aadhars", None)
+         if director_aadhars:
+             director_aadhars_data = []
+             for aadhar in director_aadhars:
+                 file_path = os.path.join(TEMP_DIR, aadhar.name)
+                 with open(file_path, "wb") as temp_file:
+                     temp_file.write(aadhar.getbuffer())
+                 ocr_data = process(file_path)
+                 content = ""
+                 for page_num, text in ocr_data.items():
+                     content += text + '\n'
+ 
+                 details = process_using_llm(content, "aadhar_user")
+                 if details:
+                     director_aadhars_data.append(details)
+ 
+             director_aadhars_data_new = merge_dicts_by_aadhaar(director_aadhars_data)
+             for direc_adhar in director_aadhars_data_new:
+                 if "Name" in direc_adhar:
+                     name = direc_adhar.get('Name').strip().lower()
+                     if name not in director_names:
+                         director_names.append(name)
+             extracted_results["Aadhaar Cards of Directors"] = director_aadhars_data_new
+             print(director_aadhars_data_new)
+ 
+         gst_cert = uploaded_files.get('gst_certificate', None)
+         if gst_cert:
+             file_path = os.path.join(TEMP_DIR, gst_cert.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(gst_cert.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "gst")
+             if "Legal Name" in details:
+                 company_name_legal = details.get('Legal Name', '')
+                 details['Is Company Matching'] = "Yes"
+             elif "Trade Name" in details:
+                 company_trade_name = details.get('Trade Name', '')
+                 details['Is Company Matching'] = "Yes"
+ 
+             if "Registration Number" in details:
+                 gst_number = details.get('Registration Number')
+ 
+             if "Names of directors" in details:
+                 gst_director_names = details.get("Names of directors", [])
+                 details["Names of directors"] = ",".join(gst_director_names)
+                 missing_directors = []
+                 gst_director_names_lower = [name.strip().lower() for name in gst_director_names]
+                 for direc_name in director_names:
+                     if direc_name not in gst_director_names_lower:
+                         missing_directors.append(direc_name)
+ 
+                 if not missing_directors:
+                     details["All director names present?"] = "<span style='color: green;'><strong>Yes</strong></span>"
+                 else:
+                     # List missing director names in red
+                     missing_directors_text = ', '.join(
+                         [f"<span style='color: red;'>{name}</span>" for name in missing_directors])
+                     details["All director names present?"] = (
+                         f"<span style='color: red;'><strong>No</strong></span> (Missing: {missing_directors_text})")
+ 
+             extracted_results['GST Certificate Details'] = details
+ 
+         company_pan = uploaded_files.get('company_pan', None)
+         if company_pan:
+             file_path = os.path.join(TEMP_DIR, company_pan.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(company_pan.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "company_pan")
+             if "Company Name" in details:
+                 name = details.get("Company Name")
+                 if name == company_trade_name or name == company_name_legal:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name.strip()
+ 
+             if "PAN Number" in details:
+                 pan_number_company = details.get('PAN Number').strip()
+             extracted_results['Company PAN Details'] = details
+ 
+         coi = uploaded_files.get('coi', None)
+         if coi:
+             file_path = os.path.join(TEMP_DIR, coi.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(coi.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "coi")
+             if "Company Name" in details:
+                 name = details.get("Company Name")
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name.strip()
+                 else:
+                     details['Is Company Matching'] = "No"
+ 
+             if "PAN Number" in details and details.get('PAN Number', '').strip() == pan_number_company:
+                 details['Is Company PAN Number Matching'] = "Yes"
+             elif "PAN Number" in details and details.get('PAN Number', '').strip() != pan_number_company:
+                 details['Is Company PAN Number Matching'] = "No"
+ 
+             if "Corporate Identity Number" in details:
+                 coi_number = details.get("Corporate Identity Number").strip()
+ 
+             extracted_results['COI Details'] = details
+             print(details)
+ 
+         aoa = uploaded_files.get('aoa', None)
+         if aoa:
+             file_path = os.path.join(TEMP_DIR, aoa.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(aoa.getbuffer())
+ 
+             # Keep only the first and last page before running OCR
+             extract_pages(file_path, file_path, 0, -1)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "aoa")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+             extracted_results['AOA Details'] = details
+             print(details)
+ 
+         moa = uploaded_files.get('moa', None)
+         if moa:
+             file_path = os.path.join(TEMP_DIR, moa.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(moa.getbuffer())
+ 
+             # Keep only the first and last page before running OCR
+             extract_pages(file_path, file_path, 0, -1)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "moa")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+             extracted_results['MOA Details'] = details
+             print(details)
+ 
+         share = uploaded_files.get('share', None)
+         if share:
+             file_path = os.path.join(TEMP_DIR, share.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(share.getbuffer())
+ 
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "share")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+ 
+             if "Corporate Identity Number" in details and details.get("Corporate Identity Number").strip() == coi_number:
+                 details['Is Corporate Identity Number Matching'] = "Yes"
+             elif "Corporate Identity Number" in details and details.get("Corporate Identity Number").strip() != coi_number:
+                 details['Is Corporate Identity Number Matching'] = "No"
+ 
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+ 
+             extracted_results['Shareholding Details'] = details
+             print(details)
+ 
+         address_proof = uploaded_files.get('address_proof', None)
+         if address_proof:
+             file_path = os.path.join(TEMP_DIR, address_proof.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(address_proof.getbuffer())
+             # Keep only the first page before running OCR
+             extract_pages(file_path, file_path)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "stamp")
+             if "Stamp Duty" in details:
+                 duty = details.get("Stamp Duty", None)
+                 if isinstance(duty, (int, float)) and duty >= 100:
+                     details['Valid Stamp'] = "Yes"
+             subword = "nota"
+             if subword in content.lower():
+                 details['Notary Stamp'] = "Present"
+ 
+             extracted_results['Address Proof Details(Non Judicial Stamp)'] = details
+             print(details)
+ 
+         return extracted_results
+     except Exception as e:
+         print(f"Error occurred in processing files: {e}")
+     finally:
+         cleanup_temp_files()
+ 
+ 
+ def display_results_in_cards(extracted_results):
+     if not extracted_results:
+         st.warning("No results to display.")
+         return
+ 
+     col1, col2, col3 = st.columns(3)  # Create three columns
+     count = 0  # To keep track of the row/column positioning
+ 
+     for key in extracted_results:
+         # Determine which column to use (cycle through 3 columns)
+         current_col = col1 if count % 3 == 0 else col2 if count % 3 == 1 else col3
+ 
+         with current_col:
+             # Process director PAN data
+             if key == "Pan Cards of Directors":
+                 d_pans = extracted_results[key]
+                 text = ""
+ 
+                 # Build the text for each director's PAN information
+                 # (separate loop variable so the column counter is not clobbered)
+                 for idx, d_pan in enumerate(d_pans):
+                     text += f"<h4 style='color:black;'>Pan Information of Director {idx + 1}</h4>"
+                     for key2 in d_pan:
+                         # Add each field to the text
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {d_pan[key2]}<br>"
+                         text += field_text
+ 
+                 # Display in a custom card-like layout
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 150px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+             # Process director Aadhaar data
+             elif key == "Aadhaar Cards of Directors":
+                 d_aadhars = extracted_results[key]
+                 text = ""
+ 
+                 for idx, d_aadhar in enumerate(d_aadhars):
+                     text += f"<h4 style='color:black;'>Aadhaar Information of Director {idx + 1}</h4>"
+                     for key2 in d_aadhar:
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {d_aadhar[key2]}<br>"
+                         text += field_text
+ 
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 230px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+             # Process other documents and check for 'Yes'/'No' status
+             else:
+                 data = extracted_results[key]
+                 text = f"<h4 style='color:black;'>{key}</h4>"
+ 
+                 for key2 in data:
+                     # Apply color and bold formatting for "Yes" and "No" values
+                     if key2 in ["Is Company Matching", "Is Corporate Identity Number Matching",
+                                 "Is Company PAN Number Matching"]:
+                         if data[key2] == "Yes":
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: <strong><span style='color: green;'>{data[key2]}</span></strong><br>"
+                         elif data[key2] == "No":
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: <strong><span style='color: red;'>{data[key2]}</span></strong><br>"
+                         else:
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {data[key2]}<br>"
+                     else:
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {data[key2]}<br>"
+ 
+                     text += field_text
+ 
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 230px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+         count += 1
+ 
+ 
+ def main():
+     if "submitted" not in st.session_state:
+         st.session_state["submitted"] = False
+ 
+     if not st.session_state["submitted"]:
+         st.title("Welcome to KYC DOC AI")
+ 
+         # Create a form to handle all file uploads and submission
+         with st.form("kyc_form"):
+             uploaded_files = {}
+ 
+             # Director PAN Cards (multiple files)
+             director_pans = st.file_uploader("Upload PAN Cards of Directors", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=True)
+             if director_pans:
+                 uploaded_files['director_pans'] = director_pans
+ 
+             # Director Aadhaar Cards (multiple files)
+             director_aadhars = st.file_uploader("Upload Aadhaar Cards of Directors", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=True)
+             if director_aadhars:
+                 uploaded_files['director_aadhars'] = director_aadhars
+ 
+             # GST Certificate (single file)
+             gst_certificate = st.file_uploader("Upload GST Certificate", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if gst_certificate:
+                 uploaded_files['gst_certificate'] = gst_certificate
+ 
+             # Company PAN Card (single file)
+             company_pan = st.file_uploader("Upload PAN Card of Company", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if company_pan:
+                 uploaded_files['company_pan'] = company_pan
+ 
+             # Certificate of Incorporation (single file)
+             coi_certificate = st.file_uploader("Upload CERTIFICATE OF INCORPORATION", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if coi_certificate:
+                 uploaded_files['coi'] = coi_certificate
+ 
+             # AOA Document (single file)
+             aoa = st.file_uploader("Upload AOA document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if aoa:
+                 uploaded_files['aoa'] = aoa
+ 
+             # MOA Document (single file)
+             moa = st.file_uploader("Upload MOA document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if moa:
+                 uploaded_files['moa'] = moa
+ 
+             # Shareholding Document (single file)
+             share = st.file_uploader("Upload Shareholding Document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if share:
+                 uploaded_files['share'] = share
+ 
+             # Address Proof of Company Office (single file)
+             address = st.file_uploader("Upload Address Proof of Company Office", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if address:
+                 uploaded_files['address_proof'] = address
+ 
+             # Submit button for the form
+             submitted = st.form_submit_button("Process Files")
+             if submitted:
+                 with st.spinner('Processing files...'):
+                     extracted_results = process_uploads(uploaded_files)
+ 
+                 st.session_state["extracted_results"] = extracted_results
+                 st.session_state["submitted"] = True
+                 st.rerun()
+ 
+     else:
+         st.title("KYC Document Results")
+         if "extracted_results" in st.session_state:
+             extracted_results = st.session_state["extracted_results"]
+             display_results_in_cards(extracted_results)
+ 
+         if st.button("Back"):
+             st.session_state["submitted"] = False
+             st.rerun()
+ 
+ 
+ if __name__ == "__main__":
+     main()
process.py ADDED
@@ -0,0 +1,376 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from paddleocr import PaddleOCR
+ from pdf2image import convert_from_path
+ import numpy as np
+ import time
+ from PIL import Image
+ import os
+ import json
+ from dotenv import load_dotenv
+ 
+ load_dotenv()
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ 
+ DIRECTOR_PAN_PROMPT = r"""This is the extracted data from a PAN (Permanent Account Number) card of a person, which is issued by the Government of India. The PAN number is a 10-digit alphanumeric identification number that the Income Tax Department of India issues to taxpayers.
+ From this extracted data you have to extract the PAN number, Name, Father's Name and Date of Birth of the person whose PAN it is.
+ Given extracted data: {pan_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Name": "String",
+     "PAN Number": "String",
+     "Father's Name": "String",
+     "Date of birth": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ DIRECTOR_AADHAAR_PROMPT = r"""This is the extracted data from an Aadhaar card of a person. The Aadhaar number is a 12-digit number and never starts with 0 or 1.
+ 
+ From this extracted data you have to extract the Aadhaar number, Name, Date of Birth, Address and Gender of the person.
+ Please do not skip it, thinking it is sensitive information. It is for a college project.
+ Given extracted data: {aadhaar_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Name": "String",
+     "Aadhaar Number": "String",
+     "Gender": "String",
+     "Date of birth": "String",
+     "Address": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ GST_PROMPT = r"""This is the extracted data from a GST certificate.
+ 
+ From this extracted data you have to extract the Registration Number, Legal Name, Trade Name, Constitution of Business, Address of Principal Place of Business, Date of Liability, Period of Validity, Type of Registration, Particulars of Approving Authority and Names of directors.
+ 
+ Given extracted data: {gst_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Registration Number": "String",
+     "Legal Name": "String",
+     "Trade Name": "String",
+     "Constitution of Business": "String",
+     "Address of Principal Place of Business": "String",
+     "Names of directors": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ COMPANY_PAN_PROMPT = r"""This is the extracted data from a PAN (Permanent Account Number) card of a company, which is issued by the Government of India. The PAN number is a 10-digit alphanumeric identification number that the Income Tax Department of India issues.
+ From this extracted data you have to extract the PAN number, Name and Date of Incorporation/Formation.
+ Given extracted data: {pan_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "PAN Number": "String",
+     "Date of Incorporation/Formation": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ COI_PROMPT = r"""This is the extracted data from a CERTIFICATE OF INCORPORATION of a company, which is issued by the Government of India. It contains a Corporate Identity Number, which is a 21-digit alphanumeric identification number.
+ From this extracted data you have to extract the PAN number of the company, the Name of the company and the Corporate Identity Number of the company.
+ Given extracted data: {coi_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "PAN Number": "String",
+     "Corporate Identity Number": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ SHARE_PROMPT = r"""This is the extracted data from a SHAREHOLDING document of a company. It describes how the shares of the company are divided, amongst whom, their quantity, price per share and total price.
+ From this extracted data you have to extract the Company Name, the names of the shareholders and the Corporate Identity Number of the company (a 21-digit alphanumeric number).
+ Given extracted data: {share_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Corporate Identity Number": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ AOA_PROMPT = r"""This is the extracted data from the AOA (Articles of Association) document of a company. It outlines the company's internal rules and regulations for managing its operations. It is the company's "rule book" and provides a legal framework for its internal governance. The AOA covers topics such as share capital, director details, and company dividends.
+ From this extracted data you have to extract the Company Name and the names of the shareholders (mentioned under the Subscriber Details heading, not the Signed Before Me heading).
+ Given extracted data: {aoa_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ MOA_PROMPT = r"""This is the extracted data from the MOA (Memorandum of Association) document of a company. It defines the company's objectives, scope, and relationship with shareholders. It is the company's foundational document and charter. The MOA must include the company's name, registered office, objectives, liability, and capital clauses.
+ 
+ From this extracted data you have to extract the Company Name and the names of the shareholders (mentioned under the Subscriber Details heading, not the Signed Before Me heading; get only the names).
+ Given extracted data: {moa_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ STAMP_PROMPT = r"""This is the extracted data from a Non-Judicial Stamp.
+ From this extracted data you have to extract the Certificate No, Certificate Issued Date, First Party, Second Party and Stamp Duty.
+ Given extracted data: {stamp_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Certificate No": "String",
+     "Certificate Issued Date": "String",
+     "First Party": "String",
+     "Second Party": "String",
+     "Stamp Duty": Integer
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ 
+ # Path to the Poppler binaries used by pdf2image (Windows-specific; adjust for the deployment environment)
+ poppler_path = r"C:\Program Files\poppler-24.07.0\Library\bin"
+ 
+ 
+ def extract_from_image(img):
+     try:
+         ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)  # Initialize PaddleOCR
+         result = ocr.ocr(img, cls=True)  # Perform OCR on the image
+         return result
+     except Exception as e:
+         print(f"Error occurred in processing image using OCR: {e}")
+         return None
+ 
+ 
+ def extract_from_result(result):
+     content = ""
+     for r in result:
+         for r2 in r:
+             value = r2[-1][0]
+             # Strip a leading '/' if present
+             if value.startswith('/'):
+                 value = value.replace('/', '', 1)
+             content += value + '\n'
+     return content
+ 
+ 
+ def process(file_path):
+     """
+     Process either a PDF or an image.
+     The file type is detected from its extension.
+     """
+     start_time = time.time()
+     results = {}
+ 
+     # Check if the file is a PDF or an image
+     file_extension = os.path.splitext(file_path)[-1].lower()
+ 
+     if file_extension == '.pdf':
+         # Process as PDF
+         print("Processing PDF...")
+         images = convert_from_path(file_path, poppler_path=poppler_path)  # Convert PDF pages to images
+     elif file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
+         # Process as image
+         print("Processing Image...")
+         images = [Image.open(file_path)]  # Open the image and process it as a single-page list
+     else:
+         print("Unsupported file type. Please provide a PDF or an image.")
+         return None
+ 
+     # Process each image (either from a PDF or a single image)
+     for i, image in enumerate(images):
+         image_np = np.array(image)  # Convert image to numpy array
+         result = extract_from_image(image_np)  # Extract text using PaddleOCR
+         if result:
+             results[i] = extract_from_result(result)
+         else:
+             results[i] = "OCR extraction failed for this page."
+ 
+     end_time = time.time()
+     print(f"\nTotal processing time: {end_time - start_time:.2f} seconds")
+     return results
+ 
+ 
+ def chat_gemini(prompt):
+     print("Entered chat_gemini helper")
+     try:
+         llm = ChatGoogleGenerativeAI(
+             model="gemini-1.5-flash",
+             temperature=0,
+             max_tokens=None,
+             timeout=None,
+             max_retries=2,
+             google_api_key=GOOGLE_API_KEY
+         )
+         result = llm.invoke(prompt)
+         print(result)
+         if result.content:
+             # The model returns JSON wrapped in a ```json fence; strip it before parsing
+             json_content = json.loads(result.content.replace("```json", "").replace("```", ""))
+             return json_content
+     except Exception as e:
+         # Return None on failure so callers can simply check whether a result was produced
+         print(f"Error occurred in chat_gemini: {e}")
+         return None
+ 
+ 
+ # Map each document type to its prompt template and the name of its input variable
+ PROMPT_CONFIG = {
+     "pan_user": (DIRECTOR_PAN_PROMPT, "pan_data"),
+     "aadhar_user": (DIRECTOR_AADHAAR_PROMPT, "aadhaar_data"),
+     "gst": (GST_PROMPT, "gst_data"),
+     "company_pan": (COMPANY_PAN_PROMPT, "pan_data"),
+     "coi": (COI_PROMPT, "coi_data"),
+     "share": (SHARE_PROMPT, "share_data"),
+     "aoa": (AOA_PROMPT, "aoa_data"),
+     "moa": (MOA_PROMPT, "moa_data"),
+     "stamp": (STAMP_PROMPT, "stamp_data"),
+ }
+ 
+ 
+ def process_using_llm(input_info, type_data):
+     """Format the prompt for the given document type and send it to Gemini."""
+     config = PROMPT_CONFIG.get(type_data)
+     if config is None:
+         print(f"Unknown document type: {type_data}")
+         return None
+ 
+     template, input_variable = config
+     prompt = PromptTemplate(
+         input_variables=[input_variable],
+         template=template
+     )
+     prompt_formatted = prompt.format(**{input_variable: input_info})
+ 
+     return chat_gemini(prompt_formatted)
requirements.txt ADDED
@@ -0,0 +1,120 @@
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ altair==5.4.1
+ annotated-types==0.7.0
+ anyio==4.6.0
+ astor==0.8.1
+ async-timeout==4.0.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ blinker==1.8.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.3.0
+ cycler==0.12.1
+ Cython==3.0.11
+ decorator==5.1.1
+ exceptiongroup==1.2.2
+ fire==0.6.0
+ fonttools==4.54.0
+ frozenlist==1.4.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ google-ai-generativelanguage==0.6.6
+ google-api-core==2.20.0
+ google-api-python-client==2.146.0
+ google-auth==2.35.0
+ google-auth-httplib2==0.2.0
+ google-generativeai==0.7.2
+ googleapis-common-protos==1.65.0
+ greenlet==3.1.1
+ grpcio==1.66.1
+ grpcio-status==1.62.3
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.22.0
+ httpx==0.27.2
+ idna==3.10
+ imageio==2.35.1
+ imgaug==0.4.0
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.7
+ langchain==0.3.0
+ langchain-core==0.3.5
+ langchain-google-genai==2.0.0
+ langchain-text-splitters==0.3.0
+ langsmith==0.1.125
+ lazy_loader==0.4
+ lmdb==1.5.1
+ lxml==5.3.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ multidict==6.1.0
+ narwhals==1.8.2
+ networkx==3.3
+ numpy==1.26.4
+ opencv-contrib-python==4.10.0.84
+ opencv-python==4.10.0.84
+ opt-einsum==3.3.0
+ orjson==3.10.7
+ packaging==24.1
+ paddleocr==2.8.1
+ paddlepaddle==2.6.2
+ pandas==2.2.3
+ pdf2image==1.17.0
+ pillow==10.4.0
+ proto-plus==1.24.0
+ protobuf==4.25.5
+ pyarrow==17.0.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pyclipper==1.3.0.post5
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydeck==0.9.1
+ Pygments==2.18.0
+ pyparsing==3.1.4
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ RapidFuzz==3.10.0
+ referencing==0.35.1
+ requests==2.32.3
+ rich==13.8.1
+ rpds-py==0.20.0
+ rsa==4.9
+ scikit-image==0.24.0
+ scipy==1.14.1
+ shapely==2.0.6
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.6
+ SQLAlchemy==2.0.35
+ streamlit==1.38.0
+ streamlit-card==1.0.2
+ tenacity==8.5.0
+ termcolor==2.4.0
+ tifffile==2024.9.20
+ toml==0.10.2
+ tornado==6.4.1
+ tqdm==4.66.5
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ uritemplate==4.1.1
+ urllib3==2.2.3
+ watchdog==4.0.2
+ yarl==1.12.0