# Document-AI / app.py
# akshansh36's picture
# Update app.py
# 6eae52f verified
import html
import io
import os
import shutil

import PyPDF2
import streamlit as st

from process import process,process_using_llm
# Scratch directory that uploaded files are written to before OCR; it is
# wiped by cleanup_temp_files() once processing finishes.
TEMP_DIR = "temp_files"
# Configure the page with the wide layout and set the page title.
st.set_page_config(layout="wide",page_title="KYC Doc AI")
def cleanup_temp_files():
    """Wipe the temporary upload directory and leave an empty one in its place.

    Does nothing when ``TEMP_DIR`` does not exist. Any failure is reported on
    stdout instead of being raised, so cleanup never interrupts the caller.
    """
    try:
        if not os.path.exists(TEMP_DIR):
            return
        # rmtree removes the directory itself, so it is re-created afterwards.
        shutil.rmtree(TEMP_DIR)
        print(f"Temporary files in {TEMP_DIR} have been deleted.")
        os.makedirs(TEMP_DIR)
    except Exception as cleanup_error:
        print(f"An error occurred during cleanup: {cleanup_error}")
def extract_pages(input_pdf_path, output_pdf_path, start_page=None, end_page=None):
    """Write a trimmed copy of a PDF holding only its first (and last) page.

    Only two selections are supported:

    * ``start_page`` and ``end_page`` both ``None`` or both ``0``:
      the first page only.
    * ``start_page == 0`` and ``end_page == -1``: the first page followed by
      the last page. A single-page document contributes that page once.

    Any other combination prints a message and leaves the output untouched.
    ``output_pdf_path`` may be the same file as ``input_pdf_path``.

    Errors (unreadable or non-PDF input, empty document, write failure) are
    printed to stdout rather than raised; the function always returns ``None``.
    """
    try:
        # Callers in this file pass the same path for input and output, and
        # opening that path for writing truncates it. Holding the source bytes
        # in memory keeps them available to the reader in case it still needs
        # them while the writer serialises the selected pages.
        with open(input_pdf_path, 'rb') as input_pdf:
            pdf_bytes = input_pdf.read()
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        total_pages = len(reader.pages)

        # Default: extract only the first page if no specific input is provided
        if start_page is None and end_page is None:
            start_page, end_page = 0, 0

        if start_page == 0 and end_page == -1:
            # First and last page; avoid writing the same page twice when the
            # document has a single page.
            page_indexes = [0, total_pages - 1] if total_pages > 1 else [0]
        elif start_page == 0 and end_page == 0:
            page_indexes = [0]
        else:
            print("Invalid input. Only first page or (first and last page) extraction is allowed.")
            return

        writer = PyPDF2.PdfWriter()
        for page_index in page_indexes:
            # An empty document raises IndexError here, reported below.
            writer.add_page(reader.pages[page_index])

        with open(output_pdf_path, 'wb') as output_pdf:
            writer.write(output_pdf)
        print(f"PDF saved as {output_pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
def merge_dicts_by_aadhaar(data):
    """Combine per-scan Aadhaar extractions into one record per Aadhaar number.

    Each input dict is treated as one of two kinds:

    * it has both 'Gender' and 'Date of birth': contributes 'Gender',
      'Date of birth', 'Name' and the space-stripped 'Aadhaar Number';
    * otherwise: contributes only 'Address'.

    Records sharing the same Aadhaar number (compared with spaces removed) are
    merged into a single dict. Records that carry no Aadhaar number cannot be
    grouped, so each is kept as its own entry instead of raising.

    Args:
        data: iterable of dicts as produced by the Aadhaar extraction step.

    Returns:
        A list of merged dicts: numbered records in order of first appearance,
        followed by the records that had no Aadhaar number.
    """
    merged_by_number = {}
    without_number = []

    for record in data:
        raw_number = record.get("Aadhaar Number")
        # str() tolerates a number that was extracted as an int.
        number = str(raw_number).replace(" ", "") if raw_number else None

        if record.get('Gender') and record.get('Date of birth'):
            fields = {
                'Gender': record.get('Gender', None),
                'Date of birth': record.get('Date of birth', None),
                'Name': record.get('Name', None),
                'Aadhaar Number': number,
            }
        else:
            fields = {'Address': record.get('Address', None)}

        if number is None:
            without_number.append(fields)
        else:
            merged_by_number.setdefault(number, {}).update(fields)

    return list(merged_by_number.values()) + without_number
# Markup stored under the "More information needed ..." keys; the results view
# renders it as HTML.
_NEEDS_INFO_HTML = "<span style='color: #d4ac0d;'><strong>Yes</strong></span>"

# (start_page, end_page) selections understood by extract_pages.
_FIRST_PAGE = (0, 0)
_FIRST_AND_LAST_PAGE = (0, -1)


def _read_document_text(uploaded_file, page_selection=None):
    """Save an upload under TEMP_DIR, OCR it and return the pages' text, one page per line block.

    ``page_selection`` is ``None`` to OCR the file as uploaded, or a
    ``(start_page, end_page)`` pair forwarded to ``extract_pages`` to trim the
    saved file in place first.
    """
    # The file name is client-supplied: keep only its final component so the
    # write cannot land outside TEMP_DIR.
    file_path = os.path.join(TEMP_DIR, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as temp_file:
        temp_file.write(uploaded_file.getbuffer())
    if page_selection is not None:
        extract_pages(file_path, file_path, *page_selection)
    ocr_data = process(file_path)
    return "".join(text + '\n' for text in ocr_data.values())


def _extract_document(uploaded_file, doc_type, page_selection=None):
    """Return ``(fields, ocr_text)`` for one upload; ``fields`` is ``{}`` when the LLM yields no dict."""
    content = _read_document_text(uploaded_file, page_selection)
    fields = process_using_llm(content, doc_type)
    return (fields if isinstance(fields, dict) else {}), content


def _clean_text(value):
    """Return ``value`` stripped when it is a non-blank string, otherwise ``None``."""
    if isinstance(value, str):
        return value.strip() or None
    return None


def _as_name_list(value):
    """Normalise an extracted list of names (list, tuple or comma-separated string) to a list of str."""
    if isinstance(value, str):
        return [part for part in value.split(",") if part.strip()]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]
    return []


def _to_number(value):
    """Return ``value`` as a number, parsing numeric strings; ``None`` when it is not numeric."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            return float(value.replace(",", "").strip())
        except ValueError:
            return None
    return None


def _matches_any(value, references):
    """Return True when the stripped ``value`` equals one of ``references``."""
    cleaned = _clean_text(value)
    return cleaned is not None and cleaned in references


def _set_match_flag(fields, key, matched):
    """Record a validation outcome as the "Yes"/"No" string the results view colours."""
    fields[key] = "Yes" if matched else "No"


def _flag_needs_info(documents, key):
    """Mark every non-empty document dict as lacking enough data for the check named by ``key``."""
    for fields in documents:
        if fields:
            fields[key] = _NEEDS_INFO_HTML


def process_uploads(uploaded_files):
    """Run OCR + LLM extraction on the uploaded KYC documents and cross-validate them.

    Args:
        uploaded_files: dict of uploads keyed by slot. 'director_pans' and
            'director_aadhars' hold lists of files; 'gst_certificate',
            'company_pan', 'coi', 'aoa', 'moa', 'share' and 'address_proof'
            hold a single file. Missing keys are skipped. Each file object
            must expose ``name`` and ``getbuffer()``.

    Returns:
        A dict mapping a card title (e.g. 'Company PAN Details') to the
        extracted fields: a list of dicts for the two director entries, a dict
        otherwise. Company documents additionally carry validation keys
        ('Is Company Matching', 'Is Company PAN Number Matching',
        'Is Corporate Identity Number Matching', or a
        'More information needed ...' marker when fewer than two documents
        supply the value being compared).

        If processing fails part-way, the error is printed and whatever had
        been collected up to that point is returned (possibly ``{}``), so the
        caller always receives a dict.

    Temporary files are removed via ``cleanup_temp_files`` in all cases.
    """
    extracted_results = {}
    try:
        company_name = None
        company_name_legal = None
        company_trade_name = None
        pan_number_company = None
        coi_number = None
        director_names = []
        gst_dict = {}
        pan_dict = {}
        coi_dict = {}
        moa_dict = {}
        aoa_dict = {}
        share_dict = {}
        total_pan_number = 0
        total_company_names = 0
        total_coi_numbers = 0

        os.makedirs(TEMP_DIR, exist_ok=True)

        # Director PAN cards: one or many files.
        director_pans = uploaded_files.get('director_pans', None)
        if director_pans:
            director_pan_data = []
            for pan in director_pans:
                fields, _ = _extract_document(pan, "pan_user")
                if fields:
                    director_pan_data.append(fields)
            for pan_data in director_pan_data:
                name = _clean_text(pan_data.get('Name'))
                if name is not None:
                    director_names.append(name.lower())
            extracted_results['Pan Cards of Directors'] = director_pan_data

        # Director Aadhaar cards: front/back scans are merged per Aadhaar number.
        director_aadhars = uploaded_files.get("director_aadhars", None)
        if director_aadhars:
            director_aadhars_data = []
            for aadhar in director_aadhars:
                fields, _ = _extract_document(aadhar, "aadhar_user")
                if fields:
                    director_aadhars_data.append(fields)
            director_aadhars_data_new = merge_dicts_by_aadhaar(director_aadhars_data)
            for direc_adhar in director_aadhars_data_new:
                name = _clean_text(direc_adhar.get('Name'))
                if name is not None and name.lower() not in director_names:
                    director_names.append(name.lower())
            extracted_results["Aadhaar Cards of Directors"] = director_aadhars_data_new

        gst_cert = uploaded_files.get('gst_certificate', None)
        if gst_cert:
            gst_dict, _ = _extract_document(gst_cert, "gst")
            # The trade name is only used as the reference when no legal name
            # key was extracted.
            if "Legal Name" in gst_dict:
                company_name_legal = _clean_text(gst_dict.get('Legal Name'))
            elif "Trade Name" in gst_dict:
                company_trade_name = _clean_text(gst_dict.get('Trade Name'))
            if company_name_legal or company_trade_name:
                total_company_names += 1

        company_pan = uploaded_files.get('company_pan', None)
        if company_pan:
            pan_dict, _ = _extract_document(company_pan, "company_pan")
            name = _clean_text(pan_dict.get("Company Name"))
            if name is not None:
                company_name = name
                total_company_names += 1
            number = _clean_text(pan_dict.get('PAN Number'))
            if number is not None:
                pan_number_company = number
                total_pan_number += 1

        coi = uploaded_files.get('coi', None)
        if coi:
            coi_dict, _ = _extract_document(coi, "coi")
            name = _clean_text(coi_dict.get("Company Name"))
            if name is not None:
                # Takes precedence over the name read from the company PAN.
                company_name = name
                total_company_names += 1
            number = _clean_text(coi_dict.get("Corporate Identity Number"))
            if number is not None:
                coi_number = number
                total_coi_numbers += 1
            number = _clean_text(coi_dict.get("PAN Number"))
            if number is not None:
                pan_number_company = number
                total_pan_number += 1

        # AOA / MOA: only the first and last pages are OCR'd.
        aoa = uploaded_files.get('aoa', None)
        if aoa:
            aoa_dict, _ = _extract_document(aoa, "aoa", _FIRST_AND_LAST_PAGE)
            if "Share Holders" in aoa_dict:
                aoa_dict["Share Holders"] = ",".join(_as_name_list(aoa_dict["Share Holders"]))

        moa = uploaded_files.get('moa', None)
        if moa:
            moa_dict, _ = _extract_document(moa, "moa", _FIRST_AND_LAST_PAGE)
            if "Share Holders" in moa_dict:
                moa_dict["Share Holders"] = ",".join(_as_name_list(moa_dict["Share Holders"]))

        share = uploaded_files.get('share', None)
        if share:
            share_dict, _ = _extract_document(share, "share")
            if "Share Holders" in share_dict:
                share_dict["Share Holders"] = ",".join(_as_name_list(share_dict["Share Holders"]))
            number = _clean_text(share_dict.get('Corporate Identity Number'))
            if number is not None:
                coi_number = number
                total_coi_numbers += 1

        # Address proof: only the first page is OCR'd.
        address_proof = uploaded_files.get('address_proof', None)
        if address_proof:
            add_dict, content = _extract_document(address_proof, "stamp", _FIRST_PAGE)
            duty = _to_number(add_dict.get("Stamp Duty"))
            if duty is not None and duty >= 100:
                add_dict['Valid Stamp'] = "Yes"
            # "nota" is matched as a substring of the OCR text.
            if "nota" in content.lower():
                add_dict['Notary Stamp'] = "Present"
            extracted_results['Address Proof Details(Non Judicial Stamp)'] = add_dict

        # --- Company name: needs at least two documents that state a name ---
        # NOTE(review): the reference names are taken from the same documents
        # being checked, so the document that supplied a reference always
        # matches itself; only the other documents can come out as "No".
        known_names = (company_name, company_name_legal, company_trade_name)
        if any(name is not None for name in known_names) and total_company_names > 1:
            if pan_dict:
                _set_match_flag(pan_dict, 'Is Company Matching',
                                _matches_any(pan_dict.get("Company Name"), known_names))
            if coi_dict:
                _set_match_flag(coi_dict, 'Is Company Matching',
                                _matches_any(coi_dict.get("Company Name"), known_names))
            if gst_dict:
                _set_match_flag(gst_dict, 'Is Company Matching',
                                _matches_any(gst_dict.get("Legal Name"), known_names)
                                or _matches_any(gst_dict.get("Trade Name"), known_names))
            if moa_dict:
                _set_match_flag(moa_dict, 'Is Company Matching',
                                _matches_any(moa_dict.get("Company Name"), known_names))
            if aoa_dict:
                _set_match_flag(aoa_dict, 'Is Company Matching',
                                _matches_any(aoa_dict.get("Company Name"), known_names))
            if share_dict:
                _set_match_flag(share_dict, 'Is Company Matching',
                                _matches_any(share_dict.get("Company Name"), known_names))
        else:
            _flag_needs_info(
                (pan_dict, coi_dict, gst_dict, aoa_dict, moa_dict, share_dict),
                'More information needed to validate company name',
            )

        # --- Company PAN number: needs both company PAN and COI to state it ---
        if pan_number_company is not None and total_pan_number > 1:
            for fields in (pan_dict, coi_dict):
                if fields:
                    _set_match_flag(fields, 'Is Company PAN Number Matching',
                                    _matches_any(fields.get('PAN Number'), (pan_number_company,)))
        else:
            _flag_needs_info((pan_dict, coi_dict),
                             'More information needed to validate company PAN number')

        # --- Corporate Identity Number: needs both COI and shareholding doc ---
        if coi_number is not None and total_coi_numbers > 1:
            for fields in (coi_dict, share_dict):
                if fields:
                    _set_match_flag(fields, 'Is Corporate Identity Number Matching',
                                    _matches_any(fields.get('Corporate Identity Number'), (coi_number,)))
        else:
            _flag_needs_info((coi_dict, share_dict),
                             'More information needed to validate company COI number')

        # --- Directors from PAN/Aadhaar cards must appear on the GST certificate ---
        if "Names of directors" in gst_dict:
            gst_director_names = _as_name_list(gst_dict.get("Names of directors"))
            if gst_director_names:
                gst_dict["Names of directors"] = ",".join(gst_director_names)
            if director_names:
                gst_director_names_lower = {name.strip().lower() for name in gst_director_names}
                missing_directors = [name for name in director_names
                                     if name not in gst_director_names_lower]
                if not missing_directors:
                    gst_dict["All director names present?"] = "<span style='color: green;'><strong>Yes</strong></span>"
                else:
                    # Names come from OCR text, so escape them before embedding
                    # them in markup that is rendered as HTML.
                    missing_directors_text = ', '.join(
                        f"<span style='color: red;'>{html.escape(name)}</span>" for name in missing_directors)
                    gst_dict["All director names present?"] = f"<span style='color: red;'><strong>No</strong></span> (Missing: {missing_directors_text})"
            else:
                gst_dict['More information needed to validate Director names'] = _NEEDS_INFO_HTML

        # Insertion order here is the order the cards are displayed in.
        for title, fields in (
            ('Company PAN Details', pan_dict),
            ('COI Details', coi_dict),
            ('GST Certificate Details', gst_dict),
            ('MOA Details', moa_dict),
            ('AOA Details', aoa_dict),
            ('Shareholding Details', share_dict),
        ):
            if fields:
                extracted_results[title] = fields
        return extracted_results
    except Exception as e:
        print(f"error occured in processing files {e}")
        # Hand back what was collected so far so the caller gets a dict.
        return extracted_results
    finally:
        cleanup_temp_files()
# Keys whose "Yes"/"No" value is shown in green/red.
_MATCH_FLAG_KEYS = (
    "Is Company Matching",
    "Is Corporate Identity Number Matching",
    "Is Company PAN Number Matching",
)
_MATCH_FLAG_COLORS = {"Yes": "green", "No": "red"}

# Keys whose value is markup generated by process_uploads (not document text)
# and is therefore rendered as-is.
_GENERATED_MARKUP_PREFIX = "More information needed to validate"
_GENERATED_MARKUP_KEY = "All director names present?"


def _field_html(label, value):
    """Render one ``label: value`` line; text is HTML-escaped unless it is app-generated markup."""
    safe_label = html.escape(str(label))
    if label in _MATCH_FLAG_KEYS and isinstance(value, str) and value in _MATCH_FLAG_COLORS:
        color = _MATCH_FLAG_COLORS[value]
        return (f"<strong style='color: #3498db;'>{safe_label}</strong>: "
                f"<strong><span style='color: {color};'>{value}</span></strong><br>")
    is_generated_markup = isinstance(label, str) and (
        label.startswith(_GENERATED_MARKUP_PREFIX) or label == _GENERATED_MARKUP_KEY
    )
    shown = str(value) if is_generated_markup else html.escape(str(value))
    return f"<strong style='color: #3498db;'>{safe_label}</strong>: {shown}<br>"


def _director_records_html(records, heading):
    """Render a list of per-director dicts, each under a numbered ``heading``."""
    parts = []
    for number, record in enumerate(records, start=1):
        parts.append(f"<h4 style='color:black;'>{heading} {number}</h4>")
        parts.extend(_field_html(label, record[label]) for label in record)
    return "".join(parts)


def _card_html(body, inner_height):
    """Wrap ``body`` in the bordered, scrollable card container."""
    return (
        '<div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">\n'
        f'<div style="color: #333; height: {inner_height}px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">\n'
        f'<p>{body}</p>\n'
        '</div>\n'
        '</div>'
    )


def display_results_in_cards(extracted_results):
    """Render each entry of ``extracted_results`` as a card, cycling through three columns.

    Args:
        extracted_results: dict mapping a card title to its data. The
            "Pan Cards of Directors" and "Aadhaar Cards of Directors" entries
            are lists of dicts (one per director); every other entry is a
            single dict of field -> value.

    Field names and values are HTML-escaped because they originate from
    document text; only the validation markers generated by the app itself
    are rendered as markup.
    """
    columns = st.columns(3)
    # `position` alone decides the column; the per-director numbering uses its
    # own counter so it cannot disturb the layout.
    for position, (title, data) in enumerate(extracted_results.items()):
        column = columns[position % 3]
        if title == "Pan Cards of Directors":
            body = _director_records_html(data, "Pan Information of Director")
            inner_height = 150
        elif title == "Aadhaar Cards of Directors":
            body = _director_records_html(data, "Aadhaar Information of Director")
            inner_height = 230
        else:
            body = f"<h4 style='color:black;'>{html.escape(str(title))}</h4>"
            body += "".join(_field_html(label, data[label]) for label in data)
            inner_height = 230
        column.markdown(_card_html(body, inner_height), unsafe_allow_html=True)
def main():
    """Drive the two-screen app: an upload form, then the extracted results.

    ``st.session_state["submitted"]`` selects the screen. Submitting the form
    runs ``process_uploads`` on the provided files, stores the outcome under
    ``st.session_state["extracted_results"]`` and reruns to show the results
    screen; the "Back" button returns to the form.
    """
    if "submitted" not in st.session_state:
        st.session_state["submitted"] = False

    if not st.session_state["submitted"]:
        st.title("Welcome to KYC DOC AI")

        # (key passed to process_uploads, uploader label, accepts several files)
        upload_slots = (
            ('director_pans', "Upload PAN Cards of Directors", True),
            ('director_aadhars', "Upload Aadhar Cards of Directors", True),
            ('gst_certificate', "Upload GST Certificate", False),
            ('company_pan', "Upload PAN Card of Company", False),
            ('coi', "Upload CERTIFICATE OF INCORPORATION", False),
            ('aoa', "Upload AOA document", False),
            ('moa', "Upload MOA document", False),
            ('share', "Upload Shareholding Document", False),
            ('address_proof', "Upload Address Proof of Company Office", False),
        )

        # A single form handles all file uploads and the submission.
        with st.form("kyc_form"):
            uploaded_files = {}
            for slot, label, multiple in upload_slots:
                selection = st.file_uploader(
                    label,
                    type=['png', 'jpg', 'jpeg', 'pdf'],
                    accept_multiple_files=multiple,
                )
                # Slots left empty are simply omitted from the dict.
                if selection:
                    uploaded_files[slot] = selection

            submitted = st.form_submit_button("Process Files")
            if submitted:
                with st.spinner('Processing files...'):
                    extracted_results = process_uploads(uploaded_files)
                # Normalise a missing (None) result to an empty dict so the
                # results screen never has to iterate over None.
                st.session_state["extracted_results"] = extracted_results or {}
                st.session_state["submitted"] = True
                st.rerun()
    else:
        st.title("KYC Document Results")
        extracted_results = st.session_state.get("extracted_results")
        if extracted_results:
            display_results_in_cards(extracted_results)
        else:
            st.warning("No details could be extracted. Please go back and upload the documents again.")
        if st.button("Back"):
            st.session_state["submitted"] = False
            st.rerun()
# Start the app only when this file is run as the main module.
if __name__ == "__main__":
    main()